Add TextDocument_test.py
This commit is contained in:
parent
8afb25e0d4
commit
e6636754ff
2 changed files with 48 additions and 2 deletions
|
|
@ -16,12 +16,12 @@ def test_PdfDocument():
|
||||||
"test_data",
|
"test_data",
|
||||||
"artificial-intelligence.pdf",
|
"artificial-intelligence.pdf",
|
||||||
)
|
)
|
||||||
pdf_doc = PdfDocument(
|
document = PdfDocument(
|
||||||
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path
|
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path
|
||||||
)
|
)
|
||||||
|
|
||||||
for ground_truth, paragraph_data in zip(
|
for ground_truth, paragraph_data in zip(
|
||||||
GROUND_TRUTH, pdf_doc.read(chunk_size=1024)
|
GROUND_TRUTH, document.read(chunk_size=1024)
|
||||||
):
|
):
|
||||||
assert (
|
assert (
|
||||||
ground_truth["word_count"] == paragraph_data.word_count
|
ground_truth["word_count"] == paragraph_data.word_count
|
||||||
|
|
|
||||||
46
cognee/tests/integration/documents/TextDocument_test.py
Normal file
46
cognee/tests/integration/documents/TextDocument_test.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from cognee.modules.data.processing.document_types.TextDocument import TextDocument
|
||||||
|
|
||||||
|
# Expected chunk statistics per fixture file: for each input file, the
# ordered (word_count, len_text) pairs that TextDocument.read() should
# produce; every expected chunk ends on a paragraph boundary.
GROUND_TRUTH = {
    file_name: [
        {"word_count": word_count, "len_text": len_text, "cut_type": "paragraph_end"}
        for word_count, len_text in chunk_stats
    ]
    for file_name, chunk_stats in (
        ("code.txt", ((253, 953), (157, 905))),
        ("Natural_language_processing.txt", ((115, 839), (15, 146))),
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "input_file,chunk_size",
    [("code.txt", 256), ("Natural_language_processing.txt", 128)],
)
def test_TextDocument(input_file, chunk_size):
    """Verify TextDocument.read() chunking against precomputed ground truth.

    For each fixture file, reads the document with the given chunk size and
    checks every produced paragraph chunk's word count, text length, and cut
    type against GROUND_TRUTH, in order.
    """
    # Fixture lives two directories up from this test module, under test_data/.
    test_file_path = os.path.join(
        os.sep,
        *(os.path.dirname(__file__).split(os.sep)[:-2]),
        "test_data",
        input_file,
    )
    document = TextDocument(
        id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path
    )

    expected_chunks = GROUND_TRUTH[input_file]
    produced = 0
    for ground_truth, paragraph_data in zip(
        expected_chunks, document.read(chunk_size=chunk_size)
    ):
        produced += 1
        assert (
            ground_truth["word_count"] == paragraph_data.word_count
        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
        assert ground_truth["len_text"] == len(
            paragraph_data.text
        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
        assert (
            ground_truth["cut_type"] == paragraph_data.cut_type
        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'

    # zip() stops at the shorter iterable, so a reader that yields too few
    # chunks would otherwise pass vacuously — require full coverage.
    assert produced == len(
        expected_chunks
    ), f"expected {len(expected_chunks)} chunks, got {produced}"
|
||||||
Loading…
Add table
Reference in a new issue