Fix sentence_cut return value in inappropriate places

This commit is contained in:
Leon Luithlen 2024-11-14 14:40:42 +01:00
parent b4d509e682
commit 73f24f9e4d
4 changed files with 19 additions and 9 deletions

View file

@ -58,7 +58,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
chunk_index += 1
last_cut_type = end_type
last_paragraph_id = paragraph_id
# Yield any remaining text
if current_chunk:

View file

@ -10,12 +10,24 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
paragraph_id = uuid4()
chunk_index = 0
word_count = 0
section_end = False
for (word, word_type) in chunk_by_word(data):
sentence += word
word_count += 1
if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)):
# Track how the text ends: when the word is a paragraph_end or sentence_end,
# remember its word_type for the final yield in the function; if a later word
# contains any letters, the remainder is a "sentence_cut" instead.
if word_type in ["paragraph_end", "sentence_end"]:
section_end = word_type
else:
for character in word:
if character.isalpha():
section_end = "sentence_cut"
break
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
yield (paragraph_id, chunk_index, sentence, word_count, word_type)
sentence = ""
word_count = 0
@ -28,5 +40,5 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
chunk_index,
sentence,
word_count,
"sentence_cut",
section_end,
)

View file

@ -6,7 +6,7 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc
GROUND_TRUTH = [
{"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
{"word_count": 58, "len_text": 358, "cut_type": "sentence_end"},
{"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"},
{"word_count": 41, "len_text": 219, "cut_type": "sentence_end"},
]
TEST_TEXT = """
@ -21,8 +21,7 @@ TEST_TEXT = """
"Then you'll ship it without me. I won't stake my reputation on a house of cards."
"Are you threatening to quit?"
"No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this."
"The feature ships, Sarah. That's final."
"""
"The feature ships, Sarah. That's final.\""""
def test_AudioDocument():
@ -31,7 +30,7 @@ def test_AudioDocument():
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location=""
)
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
l = list(document.read(chunk_size=64))
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64)
):

View file

@ -8,11 +8,11 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum
GROUND_TRUTH = {
"code.txt": [
{"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"},
{"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"},
{"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"},
],
"Natural_language_processing.txt": [
{"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
{"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"},
{"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"},
],
}