cognee/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
alekszievr c1f7b667d1
feat: Eliminate the use of max_chunk_tokens and use a unified max_chunk_size instead [cog-1381] (#626)

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


## Summary by CodeRabbit

- **Refactor**
  - Simplified text processing by unifying multiple size-related parameters into a single metric across chunking and extraction functionalities (see the sketch after this list).
  - Streamlined logic for text segmentation by removing redundant calculations and checks, resulting in a more consistent chunk management process.
- **Chores**
  - Removed the `modal` package as a dependency.
- **Documentation**
  - Updated the README.md to include a new demo video link and clarified default environment variable settings.
  - Enhanced the CONTRIBUTING.md to improve clarity and engagement for potential contributors.
- **Bug Fixes**
  - Improved handling of sentence-ending punctuation in text processing to include additional characters.
- **Version Update**
  - Updated project version to 0.1.33 in the pyproject.toml file.
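
The unification described above replaces two overlapping limits (`max_chunk_tokens` and `max_chunk_size`, per the commit title) with a single size metric. Below is a minimal sketch of the idea, not cognee's actual implementation: the function name `chunk_words`, the greedy word-level strategy, and the `count_tokens` callback are illustrative assumptions; only the unified `max_chunk_size` name comes from the commit title, and the `ValueError` on oversized words mirrors the behavior the third test below expects.

```python
from typing import Callable, Iterator


def chunk_words(
    text: str,
    max_chunk_size: int,
    count_tokens: Callable[[str], int],
) -> Iterator[str]:
    """Greedy word-level chunker enforcing a single token-based size limit."""
    current: list[str] = []
    for word in text.split(" "):
        if count_tokens(word) > max_chunk_size:
            # A single word that cannot fit under the limit is an error,
            # mirroring the ValueError the long-word test below checks for.
            raise ValueError(f"Word longer than max_chunk_size={max_chunk_size}")
        candidate = " ".join(current + [word])
        if current and count_tokens(candidate) > max_chunk_size:
            # Flush the current chunk and start a new one with this word.
            yield " ".join(current)
            current = [word]
        else:
            current.append(word)
    if current:
        yield " ".join(current)


# A whitespace tokenizer stands in for the embedding engine's tokenizer.
print(list(chunk_words("one two three four five", 2, lambda s: len(s.split()))))
# ['one two', 'three four', 'five']
```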
2025-03-12 14:03:41 +01:00

from itertools import product

import numpy as np
import pytest

from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
from cognee.tasks.chunks import chunk_by_sentence
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS, INPUT_TEXTS_LONGWORDS

# Size limits to parametrize over; None exercises the default behavior.
maximum_length_vals = [None, 16, 64]


@pytest.mark.parametrize(
    "input_text,maximum_length",
    list(product(list(INPUT_TEXTS.values()), maximum_length_vals)),
)
def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
    """Concatenating the chunk texts must reproduce the input exactly."""
    chunks = chunk_by_sentence(input_text, maximum_length)
    reconstructed_text = "".join([chunk[1] for chunk in chunks])
    assert reconstructed_text == input_text, (
        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
    )


@pytest.mark.parametrize(
    "input_text,maximum_length",
    list(
        product(
            list(INPUT_TEXTS.values()),
            [val for val in maximum_length_vals if val is not None],
        )
    ),
)
def test_paragraph_chunk_length(input_text, maximum_length):
    """No chunk may exceed maximum_length, measured in embedding-engine tokens."""
    chunks = list(chunk_by_sentence(input_text, maximum_length))

    embedding_engine = get_embedding_engine()
    chunk_lengths = np.array(
        [embedding_engine.tokenizer.count_tokens(chunk[1]) for chunk in chunks]
    )

    larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
    assert np.all(chunk_lengths <= maximum_length), (
        f"{maximum_length = }: {larger_chunks} are too large"
    )


@pytest.mark.parametrize(
    "input_text,maximum_length",
    list(
        product(
            list(INPUT_TEXTS_LONGWORDS.values()),
            [val for val in maximum_length_vals if val is not None],
        )
    ),
)
def test_paragraph_chunk_long_input(input_text, maximum_length):
    """Inputs containing words longer than maximum_length must raise ValueError."""
    with pytest.raises(ValueError):
        list(chunk_by_sentence(input_text, maximum_length))
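
For reference, a minimal usage sketch of the contract these tests pin down. It relies only on what the tests above establish: chunks are tuples whose index 1 holds the chunk text, and concatenating those texts reproduces the input. The sample string and size limit are arbitrary, and extending the reconstruction property to text outside the `INPUT_TEXTS` fixtures is an assumption.

```python
from cognee.tasks.chunks import chunk_by_sentence

sample = "First sentence. Second one! A third?"

# Isomorphism: joining the chunk texts reproduces the input exactly.
chunks = list(chunk_by_sentence(sample, 64))
assert "".join(chunk[1] for chunk in chunks) == sample

# maximum_length=None is also parametrized above, exercising the default.
unbounded = list(chunk_by_sentence(sample, None))
assert "".join(chunk[1] for chunk in unbounded) == sample
```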