Fix sentence_cut return value in inappropriate places

This commit is contained in:
Leon Luithlen 2024-11-14 14:40:42 +01:00
parent b4d509e682
commit 73f24f9e4d
4 changed files with 19 additions and 9 deletions

View file

@ -58,7 +58,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
chunk_index += 1 chunk_index += 1
last_cut_type = end_type last_cut_type = end_type
last_paragraph_id = paragraph_id
# Yield any remaining text # Yield any remaining text
if current_chunk: if current_chunk:

View file

@ -10,12 +10,24 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
paragraph_id = uuid4() paragraph_id = uuid4()
chunk_index = 0 chunk_index = 0
word_count = 0 word_count = 0
section_end = False
for (word, word_type) in chunk_by_word(data): for (word, word_type) in chunk_by_word(data):
sentence += word sentence += word
word_count += 1 word_count += 1
if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)): # Track whether any letters follow the most recent paragraph_end or sentence_end;
# if none do, preserve that word_type for the final yield of the
# function instead of reporting "sentence_cut".
if word_type in ["paragraph_end", "sentence_end"]:
section_end = word_type
else:
for character in word:
if character.isalpha():
section_end = "sentence_cut"
break
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
yield (paragraph_id, chunk_index, sentence, word_count, word_type) yield (paragraph_id, chunk_index, sentence, word_count, word_type)
sentence = "" sentence = ""
word_count = 0 word_count = 0
@ -28,5 +40,5 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
chunk_index, chunk_index,
sentence, sentence,
word_count, word_count,
"sentence_cut", section_end,
) )

View file

@ -6,7 +6,7 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc
GROUND_TRUTH = [ GROUND_TRUTH = [
{"word_count": 57, "len_text": 353, "cut_type": "sentence_end"}, {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
{"word_count": 58, "len_text": 358, "cut_type": "sentence_end"}, {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"},
{"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"}, {"word_count": 41, "len_text": 219, "cut_type": "sentence_end"},
] ]
TEST_TEXT = """ TEST_TEXT = """
@ -21,8 +21,7 @@ TEST_TEXT = """
"Then you'll ship it without me. I won't stake my reputation on a house of cards." "Then you'll ship it without me. I won't stake my reputation on a house of cards."
"Are you threatening to quit?" "Are you threatening to quit?"
"No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this." "No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this."
"The feature ships, Sarah. That's final." "The feature ships, Sarah. That's final.\""""
"""
def test_AudioDocument(): def test_AudioDocument():
@ -31,7 +30,7 @@ def test_AudioDocument():
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="" id=uuid.uuid4(), name="audio-dummy-test", raw_data_location=""
) )
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
l = list(document.read(chunk_size=64))
for ground_truth, paragraph_data in zip( for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64) GROUND_TRUTH, document.read(chunk_size=64)
): ):

View file

@ -8,11 +8,11 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum
GROUND_TRUTH = { GROUND_TRUTH = {
"code.txt": [ "code.txt": [
{"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"}, {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"},
{"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"}, {"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"},
], ],
"Natural_language_processing.txt": [ "Natural_language_processing.txt": [
{"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"}, {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
{"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"}, {"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"},
], ],
} }