cognee/cognee/tests/integration/documents/TextDocument_test.py
alekszievr a61df966c6
feat: use external chunker [cog-1354] (#551)
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Introduced a modular content chunking interface that offers flexible
text segmentation with configurable chunk size and overlap.
- Added new chunkers for enhanced text processing, including
`LangchainChunker` and improved `TextChunker`.

- **Refactor**
- Unified the chunk extraction mechanism across various document types
for improved consistency and type safety.
- Updated method signatures to enhance clarity and type safety regarding
chunker usage.
- Enhanced error handling and logging during text segmentation to guide
adjustments when content exceeds limits.

- **Bug Fixes**
- Adjusted expected output in tests to reflect changes in chunking logic
and configurations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-02-21 14:10:59 +01:00

51 lines
1.7 KiB
Python

import os
import uuid
from pathlib import Path

import pytest

from cognee.modules.chunking.TextChunker import TextChunker
from cognee.modules.data.processing.document_types.TextDocument import TextDocument
# Expected chunk metrics per test fixture: for each input file, the ordered
# list of (word_count, len_text, cut_type) produced by TextChunker.
GROUND_TRUTH = {
    "code.txt": [
        dict(word_count=252, len_text=1376, cut_type="paragraph_end"),
        dict(word_count=56, len_text=481, cut_type="paragraph_end"),
    ],
    "Natural_language_processing.txt": [
        dict(word_count=128, len_text=984, cut_type="paragraph_end"),
        dict(word_count=1, len_text=1, cut_type="paragraph_end"),
    ],
}
@pytest.mark.parametrize(
    "input_file,chunk_size",
    [("code.txt", 256), ("Natural_language_processing.txt", 128)],
)
def test_TextDocument(input_file, chunk_size):
    """Chunk a fixture file with TextChunker and compare every chunk's
    word count, text length, and cut type against GROUND_TRUTH.

    Fixture files live in <tests>/test_data, two directories above this module.
    """
    # Portable path construction: the old os.sep-splitting approach broke on
    # Windows (drive letters / backslashes); parents[2] climbs the same two levels.
    test_file_path = str(Path(__file__).parents[2] / "test_data" / input_file)

    document = TextDocument(
        id=uuid.uuid4(),
        name=input_file,
        raw_data_location=test_file_path,
        external_metadata="",
        mime_type="",
    )

    # Materialize the chunks so we can check the count: a bare zip() would
    # silently truncate and let a document that yields too few chunks pass.
    chunks = list(
        document.read(chunk_size=chunk_size, chunker_cls=TextChunker, max_chunk_tokens=1024)
    )
    expected = GROUND_TRUTH[input_file]
    assert len(chunks) == len(expected), (
        f"{len(expected) = } != {len(chunks) = } for {input_file!r}"
    )

    for ground_truth, paragraph_data in zip(expected, chunks):
        assert ground_truth["word_count"] == paragraph_data.word_count, (
            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
        )
        assert ground_truth["len_text"] == len(paragraph_data.text), (
            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
        )
        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
        )