<!-- .github/pull_request_template.md -->

## Description

<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **Refactor**
  - Simplified text processing by unifying multiple size-related parameters into a single metric across the chunking and extraction functionality.
  - Streamlined text-segmentation logic by removing redundant calculations and checks, giving a more consistent chunking process.
- **Chores**
  - Removed the `modal` package as a dependency.
- **Documentation**
  - Updated README.md with a new demo video link and clarified default environment variable settings.
  - Improved CONTRIBUTING.md for clarity and to better engage potential contributors.
- **Bug Fixes**
  - Extended the set of characters treated as sentence-ending punctuation in text processing (illustrated in the sketch below).
- **Version Update**
  - Bumped the project version to 0.1.33 in pyproject.toml.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
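For context on the punctuation fix noted under Bug Fixes, here is a minimal illustrative sketch of extended sentence-ending handling. It is not cognee's actual implementation; the character set and the `split_sentences` helper are assumptions for illustration only.

```python
import re

# Hypothetical illustration: treat an extended set of characters (including
# the ellipsis) as sentence-ending, splitting on the whitespace after them so
# each terminator stays attached to its sentence.
SENTENCE_END = re.compile(r"(?<=[.!?…])\s+")

def split_sentences(text: str) -> list[str]:
    return [part for part in SENTENCE_END.split(text) if part]

# split_sentences("One. Two! Three… Four?") -> ['One.', 'Two!', 'Three…', 'Four?']
```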
from itertools import product

import numpy as np
import pytest

from cognee.tasks.chunks import chunk_by_sentence
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS, INPUT_TEXTS_LONGWORDS
from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine

maximum_length_vals = [None, 16, 64]


@pytest.mark.parametrize(
    "input_text,maximum_length",
    list(product(list(INPUT_TEXTS.values()), maximum_length_vals)),
)
def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
    """Joining the chunk texts must reproduce the input text exactly."""
    chunks = chunk_by_sentence(input_text, maximum_length)
    reconstructed_text = "".join([chunk[1] for chunk in chunks])
    assert reconstructed_text == input_text, (
        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
    )


@pytest.mark.parametrize(
    "input_text,maximum_length",
    list(
        product(
            list(INPUT_TEXTS.values()),
            [val for val in maximum_length_vals if val is not None],
        )
    ),
)
def test_chunk_by_sentence_chunk_length(input_text, maximum_length):
    """No chunk may exceed maximum_length, measured in embedding-engine tokens."""
    chunks = list(chunk_by_sentence(input_text, maximum_length))

    embedding_engine = get_embedding_engine()
    chunk_lengths = np.array(
        [embedding_engine.tokenizer.count_tokens(chunk[1]) for chunk in chunks]
    )

    larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
    assert np.all(chunk_lengths <= maximum_length), (
        f"{maximum_length = }: {larger_chunks} are too large"
    )


@pytest.mark.parametrize(
    "input_text,maximum_length",
    list(
        product(
            list(INPUT_TEXTS_LONGWORDS.values()),
            [val for val in maximum_length_vals if val is not None],
        )
    ),
)
def test_chunk_by_sentence_long_input(input_text, maximum_length):
    """Inputs containing words longer than maximum_length must raise ValueError."""
    with pytest.raises(ValueError):
        list(chunk_by_sentence(input_text, maximum_length))
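# ---------------------------------------------------------------------------
# Usage sketch, not part of the test suite: how chunk_by_sentence behaves,
# inferred from the assertions above. That chunk[1] holds the chunk text is
# taken from the tests; any richer tuple layout is an assumption.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = "First sentence. Second one! A third?"
    # With maximum_length=16, no chunk should exceed 16 tokens, and joining
    # the chunk texts reproduces `sample` exactly (the isomorphism property).
    for chunk in chunk_by_sentence(sample, maximum_length=16):
        print(repr(chunk[1]))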