Fix sentence_cut return value in inappropriate places
This commit is contained in:
parent
b4d509e682
commit
73f24f9e4d
4 changed files with 19 additions and 9 deletions
|
|
@ -58,7 +58,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
|
|||
chunk_index += 1
|
||||
|
||||
last_cut_type = end_type
|
||||
last_paragraph_id = paragraph_id
|
||||
|
||||
# Yield any remaining text
|
||||
if current_chunk:
|
||||
|
|
|
|||
|
|
@ -10,12 +10,24 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
|||
paragraph_id = uuid4()
|
||||
chunk_index = 0
|
||||
word_count = 0
|
||||
section_end = False
|
||||
|
||||
for (word, word_type) in chunk_by_word(data):
|
||||
sentence += word
|
||||
word_count += 1
|
||||
|
||||
if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)):
|
||||
# this loop is to check if any letters come after a paragraph_end or sentence_end
|
||||
# and if that is not the case, preserve the word_type for the final yield in the
|
||||
# function
|
||||
if word_type in ["paragraph_end", "sentence_end"]:
|
||||
section_end = word_type
|
||||
else:
|
||||
for character in word:
|
||||
if character.isalpha():
|
||||
section_end = "sentence_cut"
|
||||
break
|
||||
|
||||
if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
|
||||
yield (paragraph_id, chunk_index, sentence, word_count, word_type)
|
||||
sentence = ""
|
||||
word_count = 0
|
||||
|
|
@ -28,5 +40,5 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
|
|||
chunk_index,
|
||||
sentence,
|
||||
word_count,
|
||||
"sentence_cut",
|
||||
section_end,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc
|
|||
GROUND_TRUTH = [
|
||||
{"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
|
||||
{"word_count": 58, "len_text": 358, "cut_type": "sentence_end"},
|
||||
{"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"},
|
||||
{"word_count": 41, "len_text": 219, "cut_type": "sentence_end"},
|
||||
]
|
||||
|
||||
TEST_TEXT = """
|
||||
|
|
@ -21,8 +21,7 @@ TEST_TEXT = """
|
|||
"Then you'll ship it without me. I won't stake my reputation on a house of cards."
|
||||
"Are you threatening to quit?"
|
||||
"No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this."
|
||||
"The feature ships, Sarah. That's final."
|
||||
"""
|
||||
"The feature ships, Sarah. That's final.\""""
|
||||
|
||||
|
||||
def test_AudioDocument():
|
||||
|
|
@ -31,7 +30,7 @@ def test_AudioDocument():
|
|||
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location=""
|
||||
)
|
||||
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
|
||||
|
||||
l = list(document.read(chunk_size=64))
|
||||
for ground_truth, paragraph_data in zip(
|
||||
GROUND_TRUTH, document.read(chunk_size=64)
|
||||
):
|
||||
|
|
|
|||
|
|
@ -8,11 +8,11 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum
|
|||
GROUND_TRUTH = {
|
||||
"code.txt": [
|
||||
{"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"},
|
||||
{"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"},
|
||||
{"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"},
|
||||
],
|
||||
"Natural_language_processing.txt": [
|
||||
{"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
|
||||
{"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"},
|
||||
{"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"},
|
||||
],
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue