test: Update document tests regarding max chunk tokens
This commit is contained in:
parent
e0b7be7cf0
commit
dc0450d30e
5 changed files with 17 additions and 8 deletions
|
|
@ -34,7 +34,7 @@ def test_AudioDocument():
|
|||
)
|
||||
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
|
||||
for ground_truth, paragraph_data in zip(
|
||||
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
|
||||
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512)
|
||||
):
|
||||
assert ground_truth["word_count"] == paragraph_data.word_count, (
|
||||
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ def test_ImageDocument():
|
|||
)
|
||||
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
|
||||
for ground_truth, paragraph_data in zip(
|
||||
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
|
||||
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512)
|
||||
):
|
||||
assert ground_truth["word_count"] == paragraph_data.word_count, (
|
||||
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ def test_PdfDocument():
|
|||
)
|
||||
|
||||
for ground_truth, paragraph_data in zip(
|
||||
GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker")
|
||||
GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker", max_chunk_tokens=2048)
|
||||
):
|
||||
assert ground_truth["word_count"] == paragraph_data.word_count, (
|
||||
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
|
||||
|
|
|
|||
|
|
@ -37,7 +37,8 @@ def test_TextDocument(input_file, chunk_size):
|
|||
)
|
||||
|
||||
for ground_truth, paragraph_data in zip(
|
||||
GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker")
|
||||
GROUND_TRUTH[input_file],
|
||||
document.read(chunk_size=chunk_size, chunker="text_chunker", max_chunk_tokens=1024),
|
||||
):
|
||||
assert ground_truth["word_count"] == paragraph_data.word_count, (
|
||||
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
|
||||
|
|
|
|||
|
|
@ -68,7 +68,9 @@ def test_UnstructuredDocument():
|
|||
)
|
||||
|
||||
# Test PPTX
|
||||
for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||
for paragraph_data in pptx_document.read(
|
||||
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
|
||||
):
|
||||
assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
|
||||
assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
|
||||
assert "sentence_cut" == paragraph_data.cut_type, (
|
||||
|
|
@ -76,7 +78,9 @@ def test_UnstructuredDocument():
|
|||
)
|
||||
|
||||
# Test DOCX
|
||||
for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||
for paragraph_data in docx_document.read(
|
||||
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
|
||||
):
|
||||
assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
|
||||
assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
|
||||
assert "sentence_end" == paragraph_data.cut_type, (
|
||||
|
|
@ -84,7 +88,9 @@ def test_UnstructuredDocument():
|
|||
)
|
||||
|
||||
# TEST CSV
|
||||
for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||
for paragraph_data in csv_document.read(
|
||||
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
|
||||
):
|
||||
assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
|
||||
assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
|
||||
f"Read text doesn't match expected text: {paragraph_data.text}"
|
||||
|
|
@ -94,7 +100,9 @@ def test_UnstructuredDocument():
|
|||
)
|
||||
|
||||
# Test XLSX
|
||||
for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||
for paragraph_data in xlsx_document.read(
|
||||
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
|
||||
):
|
||||
assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
|
||||
assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
|
||||
assert "sentence_cut" == paragraph_data.cut_type, (
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue