Fix chunk_by_word_test
This commit is contained in:
parent
98cbaaff68
commit
830c6710e0
1 changed file with 6 additions and 1 deletion
|
|
@@ -9,12 +9,17 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
||||||
INPUT_TEXTS["python_code"],
|
INPUT_TEXTS["python_code"],
|
||||||
INPUT_TEXTS["chinese_text"]
|
INPUT_TEXTS["chinese_text"]
|
||||||
])
|
])
|
||||||
|
|
||||||
def test_chunk_by_word_isomorphism(input_text):
|
def test_chunk_by_word_isomorphism(input_text):
|
||||||
chunks = chunk_by_word(input_text)
|
chunks = chunk_by_word(input_text)
|
||||||
reconstructed_text = "".join([chunk[0] for chunk in chunks])
|
reconstructed_text = "".join([chunk[0] for chunk in chunks])
|
||||||
assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
|
assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("input_text", [
|
||||||
|
INPUT_TEXTS["english_text"],
|
||||||
|
INPUT_TEXTS["english_lists"],
|
||||||
|
INPUT_TEXTS["python_code"],
|
||||||
|
INPUT_TEXTS["chinese_text"]
|
||||||
|
])
|
||||||
def test_chunk_by_word_splits(input_text):
|
def test_chunk_by_word_splits(input_text):
|
||||||
chunks = np.array(list(chunk_by_word(input_text)))
|
chunks = np.array(list(chunk_by_word(input_text)))
|
||||||
space_test = np.array([" " not in chunk[0].strip() for chunk in chunks])
|
space_test = np.array([" " not in chunk[0].strip() for chunk in chunks])
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue