<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Introduced a modular content chunking interface that offers flexible text segmentation with configurable chunk size and overlap. - Added new chunkers for enhanced text processing, including `LangchainChunker` and improved `TextChunker`. - **Refactor** - Unified the chunk extraction mechanism across various document types for improved consistency and type safety. - Updated method signatures to enhance clarity and type safety regarding chunker usage. - Enhanced error handling and logging during text segmentation to guide adjustments when content exceeds limits. - **Bug Fixes** - Adjusted expected output in tests to reflect changes in chunking logic and configurations. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
51 lines
1.7 KiB
Python
51 lines
1.7 KiB
Python
import os
import uuid

import pytest

from cognee.modules.chunking.TextChunker import TextChunker
from cognee.modules.data.processing.document_types.TextDocument import TextDocument
# Expected per-chunk statistics for each fixture file, keyed by filename.
# Each entry lists, in order, the word count, text length, and cut type
# that TextChunker is expected to produce with the chunk sizes used in the
# parametrized test below.
GROUND_TRUTH = {
    "code.txt": [
        {"word_count": 252, "len_text": 1376, "cut_type": "paragraph_end"},
        {"word_count": 56, "len_text": 481, "cut_type": "paragraph_end"},
    ],
    "Natural_language_processing.txt": [
        {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
        {"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"},
    ],
}
@pytest.mark.parametrize(
    "input_file,chunk_size",
    [("code.txt", 256), ("Natural_language_processing.txt", 128)],
)
def test_TextDocument(input_file, chunk_size):
    """Chunk a fixture text file via TextDocument.read + TextChunker and
    compare each produced chunk's word count, text length, and cut type
    against the GROUND_TRUTH table above."""
    # Fixture files live in <repo>/.../test_data, two directories above
    # this test module; rebuild the absolute path from __file__.
    test_file_path = os.path.join(
        os.sep,
        *(os.path.dirname(__file__).split(os.sep)[:-2]),
        "test_data",
        input_file,
    )
    document = TextDocument(
        id=uuid.uuid4(),
        name=input_file,
        raw_data_location=test_file_path,
        external_metadata="",
        mime_type="",
    )

    # NOTE(review): zip() stops at the shorter iterable, so chunks beyond
    # the ground-truth list (or missing chunks) go unchecked -- presumably
    # the counts always match; consider strict=True to assert that.
    for ground_truth, paragraph_data in zip(
        GROUND_TRUTH[input_file],
        document.read(chunk_size=chunk_size, chunker_cls=TextChunker, max_chunk_tokens=1024),
    ):
        assert ground_truth["word_count"] == paragraph_data.word_count, (
            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
        )
        assert ground_truth["len_text"] == len(paragraph_data.text), (
            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
        )
        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
        )