From 6f0637a02849a8c457b9a8619c9da37208404438 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 11:30:30 +0100 Subject: [PATCH 01/39] Small cosmetic changes --- cognee/modules/chunking/TextChunker.py | 20 ++++++++++---------- cognee/tasks/chunks/chunk_by_word.py | 21 ++++++++++----------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 714383804..f0a72b58a 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -9,7 +9,6 @@ class TextChunker(): chunk_index = 0 chunk_size = 0 - paragraph_chunks = [] def __init__(self, document, get_text: callable, chunk_size: int = 1024): self.document = document @@ -17,6 +16,7 @@ class TextChunker(): self.get_text = get_text def read(self): + paragraph_chunks = [] for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, @@ -24,10 +24,10 @@ class TextChunker(): batch_paragraphs = True, ): if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size: - self.paragraph_chunks.append(chunk_data) + paragraph_chunks.append(chunk_data) self.chunk_size += chunk_data["word_count"] else: - if len(self.paragraph_chunks) == 0: + if len(paragraph_chunks) == 0: yield DocumentChunk( id = chunk_data["chunk_id"], text = chunk_data["text"], @@ -36,10 +36,10 @@ class TextChunker(): chunk_index = self.chunk_index, cut_type = chunk_data["cut_type"], ) - self.paragraph_chunks = [] + paragraph_chunks = [] self.chunk_size = 0 else: - chunk_text = " ".join(chunk["text"] for chunk in self.paragraph_chunks) + chunk_text = " ".join(chunk["text"] for chunk in paragraph_chunks) try: yield DocumentChunk( id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"), @@ -47,24 +47,24 @@ class TextChunker(): word_count = self.chunk_size, is_part_of = self.document, chunk_index = self.chunk_index, - cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"], + cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"], ) except Exception as e: print(e) - self.paragraph_chunks = [chunk_data] + paragraph_chunks = [chunk_data] self.chunk_size = chunk_data["word_count"] self.chunk_index += 1 - if len(self.paragraph_chunks) > 0: + if len(paragraph_chunks) > 0: try: yield DocumentChunk( id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"), - text = " ".join(chunk["text"] for chunk in self.paragraph_chunks), + text = " ".join(chunk["text"] for chunk in paragraph_chunks), word_count = self.chunk_size, is_part_of = self.document, chunk_index = self.chunk_index, - cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"], + cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"], ) except Exception as e: print(e) diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index 8621754d5..e82a9cd98 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -1,27 +1,26 @@ import re -def chunk_by_word(data: str): - sentence_endings = r"[.;!?…]" - paragraph_endings = r"[\n\r]" - last_processed_character = "" +SENTENCE_ENDINGS = r"[.;!?…]" +PARAGRAPH_ENDINGS = r"[\n\r]" +def chunk_by_word(data: str): + last_processed_character = "" word = "" i = 0 - while i < len(data): character = data[i] - if word == "" and (re.match(paragraph_endings, character) or character == " "): + if word == "" and (re.match(PARAGRAPH_ENDINGS, character) or character == " "): i = 
i + 1 continue def is_real_paragraph_end(): - if re.match(sentence_endings, last_processed_character): + if re.match(SENTENCE_ENDINGS, last_processed_character): return True j = i + 1 next_character = data[j] if j < len(data) else None - while next_character is not None and (re.match(paragraph_endings, next_character) or next_character == " "): + while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "): j += 1 next_character = data[j] if j < len(data) else None if next_character and next_character.isupper(): @@ -29,7 +28,7 @@ def chunk_by_word(data: str): return False - if re.match(paragraph_endings, character): + if re.match(PARAGRAPH_ENDINGS, character): yield (word, "paragraph_end" if is_real_paragraph_end() else "word") word = "" i = i + 1 @@ -44,13 +43,13 @@ def chunk_by_word(data: str): word += character last_processed_character = character - if re.match(sentence_endings, character): + if re.match(SENTENCE_ENDINGS, character): # Check for ellipses. if i + 2 <= len(data) and data[i] == "." and data[i + 1] == "." and data[i + 2] == ".": word += ".." i = i + 2 - is_paragraph_end = i + 1 < len(data) and re.match(paragraph_endings, data[i + 1]) + is_paragraph_end = i + 1 < len(data) and re.match(PARAGRAPH_ENDINGS, data[i + 1]) yield (word, "paragraph_end" if is_paragraph_end else "sentence_end") word = "" From 98cbaaff685b3e2e8db45c016bca2fabc1cb4589 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 11:30:42 +0100 Subject: [PATCH 02/39] Add isomorphism tests --- .../chunks/chunk_by_paragraph_test2.py | 17 ++ .../chunks/chunk_by_sentence_test.py | 17 ++ .../processing/chunks/chunk_by_word_test.py | 26 ++ .../unit/processing/chunks/test_input.py | 275 ++++++++++++++++++ 4 files changed, 335 insertions(+) create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_word_test.py create mode 100644 cognee/tests/unit/processing/chunks/test_input.py diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py new file mode 100644 index 000000000..2cb95f416 --- /dev/null +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py @@ -0,0 +1,17 @@ +import pytest +import numpy as np +from cognee.tasks.chunks import chunk_by_paragraph +from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS + +@pytest.mark.parametrize("input_text", [ + INPUT_TEXTS["english_text"], + INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"] +]) + +def test_chunk_by_paragraph_isomorphism(input_text): + chunks = chunk_by_paragraph(input_text) + reconstructed_text = "".join([chunk["text"] for chunk in chunks]) + assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py new file mode 100644 index 000000000..a21a3e9f9 --- /dev/null +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -0,0 +1,17 @@ +import pytest +import numpy as np +from cognee.tasks.chunks import chunk_by_sentence +from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS + +@pytest.mark.parametrize("input_text", [ + INPUT_TEXTS["english_text"], + 
INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"] +]) + +def test_chunk_by_sentence_isomorphism(input_text): + chunks = chunk_by_sentence(input_text) + reconstructed_text = "".join([chunk[2] for chunk in chunks]) + assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py new file mode 100644 index 000000000..54e19b162 --- /dev/null +++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py @@ -0,0 +1,26 @@ +import pytest +import numpy as np +from cognee.tasks.chunks import chunk_by_word +from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS + +@pytest.mark.parametrize("input_text", [ + INPUT_TEXTS["english_text"], + INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"] +]) + +def test_chunk_by_word_isomorphism(input_text): + chunks = chunk_by_word(input_text) + reconstructed_text = "".join([chunk[0] for chunk in chunks]) + assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + +def test_chunk_by_word_splits(input_text): + chunks = np.array(list(chunk_by_word(input_text))) + space_test = np.array([" " not in chunk[0].strip() for chunk in chunks]) + + assert np.all(space_test), f"These chunks contain spaces within them: {chunks[space_test == False]}" + + + + diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py new file mode 100644 index 000000000..ad6603d9d --- /dev/null +++ b/cognee/tests/unit/processing/chunks/test_input.py @@ -0,0 +1,275 @@ +import pytest + +INPUT_TEXTS = { + "english_lists": """Let me think through the key attributes that would be important to test in a text chunking system. +Here are the essential attributes to test: + +Chunking Boundaries Accuracy: + + +Proper sentence boundary detection +Handling of punctuation marks +Recognition of paragraph breaks +Treatment of special characters and whitespace +Proper handling of quotes and nested text structures + + +Language Support: + + +Handling of different languages and scripts +Support for multilingual documents +Proper Unicode handling +Treatment of language-specific punctuation + + +Special Cases Handling: + + +Lists and bullet points +Tables and structured content +Code blocks or technical content +Citations and references +Headers and footers +URLs and email addresses + + +Performance Metrics: + + +Processing speed for different text lengths +Memory usage with large documents +Scalability with increasing document size +Consistency across multiple runs + + +Document Format Support: + + +Plain text handling +HTML/XML content +PDF text extraction +Markdown formatting +Mixed format documents + + +Error Handling: + + +Malformed input text +Incomplete sentences +Truncated documents +Invalid characters +Missing punctuation + + +Configuration Flexibility: + + +Adjustable chunk sizes +Customizable boundary rules +Configurable overlap between chunks +Token vs. 
character-based chunking options + + +Preservation of Context: + + +Maintaining semantic coherence +Preserving contextual relationships +Handling cross-references +Maintaining document structure + +Would you like me to elaborate on any of these attributes or discuss specific testing strategies for them?""", + "python_code": """from typing import ( + Literal as L, + Any, + TypeAlias, + overload, + TypeVar, + Protocol, + type_check_only, +) + +from numpy import generic + +from numpy._typing import ( + ArrayLike, + NDArray, + _ArrayLikeInt, + _ArrayLike, +) + +__all__ = ["pad"] + +_SCT = TypeVar("_SCT", bound=generic) + +@type_check_only +class _ModeFunc(Protocol): + def __call__( + self, + vector: NDArray[Any], + iaxis_pad_width: tuple[int, int], + iaxis: int, + kwargs: dict[str, Any], + /, + ) -> None: ... + +_ModeKind: TypeAlias = L[ + "constant", + "edge", + "linear_ramp", + "maximum", + "mean", + "median", + "minimum", + "reflect", + "symmetric", + "wrap", + "empty", +] + + +# TODO: In practice each keyword argument is exclusive to one or more +# specific modes. Consider adding more overloads to express this in the future. + +# Expand `**kwargs` into explicit keyword-only arguments +@overload +def pad( + array: _ArrayLike[_SCT], + pad_width: _ArrayLikeInt, + mode: _ModeKind = ..., + *, + stat_length: None | _ArrayLikeInt = ..., + constant_values: ArrayLike = ..., + end_values: ArrayLike = ..., + reflect_type: L["odd", "even"] = ..., +) -> NDArray[_SCT]: ... +@overload +def pad( + array: ArrayLike, + pad_width: _ArrayLikeInt, + mode: _ModeKind = ..., + *, + stat_length: None | _ArrayLikeInt = ..., + constant_values: ArrayLike = ..., + end_values: ArrayLike = ..., + reflect_type: L["odd", "even"] = ..., +) -> NDArray[Any]: ... +@overload +def pad( + array: _ArrayLike[_SCT], + pad_width: _ArrayLikeInt, + mode: _ModeFunc, + **kwargs: Any, +) -> NDArray[_SCT]: ... +@overload +def pad( + array: ArrayLike, + pad_width: _ArrayLikeInt, + mode: _ModeFunc, + **kwargs: Any, +) -> NDArray[Any]: ...""", + "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""", + "english_text": """O for that warning voice, which he who saw +Th' Apocalyps, heard cry in Heaven aloud, +Then when the Dragon, put to second rout, +Came furious down to be reveng'd on men, +Wo to the inhabitants on Earth! 
that now, [ 5 ] +While time was, our first-Parents had bin warnd +The coming of thir secret foe, and scap'd +Haply so scap'd his mortal snare; for now +Satan, now first inflam'd with rage, came down, +The Tempter ere th' Accuser of man-kind, [ 10 ] +To wreck on innocent frail man his loss +Of that first Battel, and his flight to Hell: +Yet not rejoycing in his speed, though bold, +Far off and fearless, nor with cause to boast, +Begins his dire attempt, which nigh the birth [ 15 ] +Now rowling, boiles in his tumultuous brest, +And like a devillish Engine back recoiles +Upon himself; horror and doubt distract +His troubl'd thoughts, and from the bottom stirr +The Hell within him, for within him Hell [ 20 ] +He brings, and round about him, nor from Hell +One step no more then from himself can fly +By change of place: Now conscience wakes despair +That slumberd, wakes the bitter memorie +Of what he was, what is, and what must be [ 25 ] +Worse; of worse deeds worse sufferings must ensue. +Sometimes towards Eden which now in his view +Lay pleasant, his grievd look he fixes sad, +Sometimes towards Heav'n and the full-blazing Sun, +Which now sat high in his Meridian Towre: [ 30 ] +Then much revolving, thus in sighs began. + +O thou that with surpassing Glory crownd, +Look'st from thy sole Dominion like the God +Of this new World; at whose sight all the Starrs +Hide thir diminisht heads; to thee I call, [ 35 ] +But with no friendly voice, and add thy name +O Sun, to tell thee how I hate thy beams +That bring to my remembrance from what state +I fell, how glorious once above thy Spheare; +Till Pride and worse Ambition threw me down [ 40 ] +Warring in Heav'n against Heav'ns matchless King: +Ah wherefore! he deservd no such return +From me, whom he created what I was +In that bright eminence, and with his good +Upbraided none; nor was his service hard. [ 45 ] +What could be less then to afford him praise, +The easiest recompence, and pay him thanks, +How due! yet all his good prov'd ill in me, +And wrought but malice; lifted up so high +I sdeind subjection, and thought one step higher [ 50 ] +Would set me highest, and in a moment quit +The debt immense of endless gratitude, +So burthensome, still paying, still to ow; +Forgetful what from him I still receivd, +And understood not that a grateful mind [ 55 ] +By owing owes not, but still pays, at once +Indebted and dischargd; what burden then? +O had his powerful Destiny ordaind +Me some inferiour Angel, I had stood +Then happie; no unbounded hope had rais'd [ 60 ] +Ambition. Yet why not? som other Power +As great might have aspir'd, and me though mean +Drawn to his part; but other Powers as great +Fell not, but stand unshak'n, from within +Or from without, to all temptations arm'd. [ 65 ] +Hadst thou the same free Will and Power to stand? +Thou hadst: whom hast thou then or what to accuse, +But Heav'ns free Love dealt equally to all? +Be then his Love accurst, since love or hate, +To me alike, it deals eternal woe. [ 70 ] +Nay curs'd be thou; since against his thy will +Chose freely what it now so justly rues. +Me miserable! which way shall I flie +Infinite wrauth, and infinite despaire? +Which way I flie is Hell; my self am Hell; [ 75 ] +And in the lowest deep a lower deep +Still threatning to devour me opens wide, +To which the Hell I suffer seems a Heav'n. +O then at last relent: is there no place +Left for Repentance, none for Pardon left? 
[ 80 ] +None left but by submission; and that word +Disdain forbids me, and my dread of shame +Among the Spirits beneath, whom I seduc'd +With other promises and other vaunts +Then to submit, boasting I could subdue [ 85 ] +Th' Omnipotent. Ay me, they little know +How dearly I abide that boast so vaine, +Under what torments inwardly I groane: +While they adore me on the Throne of Hell, +With Diadem and Sceptre high advanc'd [ 90 ] +The lower still I fall, onely Supream +In miserie; such joy Ambition findes. +But say I could repent and could obtaine +By Act of Grace my former state; how soon +Would higth recall high thoughts, how soon unsay [ 95 ] +What feign'd submission swore: ease would recant +Vows made in pain, as violent and void. +For never can true reconcilement grow +Where wounds of deadly hate have peirc'd so deep: +Which would but lead me to a worse relapse [ 100 ]""" +} \ No newline at end of file From 830c6710e074d06504fc02231b5d1a34fec61f21 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 11:45:56 +0100 Subject: [PATCH 03/39] Fix chunk_by_word_test --- cognee/tests/unit/processing/chunks/chunk_by_word_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py index 54e19b162..38e5d9b5a 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py @@ -9,12 +9,17 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS INPUT_TEXTS["python_code"], INPUT_TEXTS["chinese_text"] ]) - def test_chunk_by_word_isomorphism(input_text): chunks = chunk_by_word(input_text) reconstructed_text = "".join([chunk[0] for chunk in chunks]) assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" +@pytest.mark.parametrize("input_text", [ + INPUT_TEXTS["english_text"], + INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"] +]) def test_chunk_by_word_splits(input_text): chunks = np.array(list(chunk_by_word(input_text))) space_test = np.array([" " not in chunk[0].strip() for chunk in chunks]) From c054e897a33f0a343abe12c4effdf3d747957be5 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 11:47:13 +0100 Subject: [PATCH 04/39] Make chunk_by_word isomorphic --- cognee/tasks/chunks/chunk_by_word.py | 84 +++++++++++++++++----------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index e82a9cd98..7ebf4bced 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -4,20 +4,29 @@ SENTENCE_ENDINGS = r"[.;!?…]" PARAGRAPH_ENDINGS = r"[\n\r]" def chunk_by_word(data: str): + """ + Chunks text into words and endings while preserving whitespace. + Whitespace is included with the preceding word. + Outputs can be joined with "" to recreate the original input. 
+ """ last_processed_character = "" - word = "" + current_chunk = "" i = 0 + + # Handle leading whitespace if any + while i < len(data) and (re.match(PARAGRAPH_ENDINGS, data[i]) or data[i] == " "): + current_chunk += data[i] + i += 1 + if current_chunk: + yield (current_chunk, "word") + current_chunk = "" + while i < len(data): character = data[i] - - if word == "" and (re.match(PARAGRAPH_ENDINGS, character) or character == " "): - i = i + 1 - continue - + def is_real_paragraph_end(): if re.match(SENTENCE_ENDINGS, last_processed_character): return True - j = i + 1 next_character = data[j] if j < len(data) else None while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "): @@ -25,35 +34,44 @@ def chunk_by_word(data: str): next_character = data[j] if j < len(data) else None if next_character and next_character.isupper(): return True - return False - + if re.match(PARAGRAPH_ENDINGS, character): - yield (word, "paragraph_end" if is_real_paragraph_end() else "word") - word = "" - i = i + 1 + if current_chunk: + yield (current_chunk, "word") + current_chunk = "" + yield (character, "paragraph_end" if is_real_paragraph_end() else "word") + i += 1 continue - - if character == " ": - yield [word, "word"] - word = "" - i = i + 1 - continue - - word += character + + current_chunk += character last_processed_character = character - + + if character == " ": + yield (current_chunk, "word") + current_chunk = "" + i += 1 + continue + if re.match(SENTENCE_ENDINGS, character): - # Check for ellipses. - if i + 2 <= len(data) and data[i] == "." and data[i + 1] == "." and data[i + 2] == ".": - word += ".." - i = i + 2 - - is_paragraph_end = i + 1 < len(data) and re.match(PARAGRAPH_ENDINGS, data[i + 1]) - yield (word, "paragraph_end" if is_paragraph_end else "sentence_end") - word = "" - + # Check for ellipses + if i + 2 < len(data) and data[i:i+3] == "...": + current_chunk += ".." 
+ i += 2 + + # Look ahead for whitespace + next_i = i + 1 + while next_i < len(data) and data[next_i] == " ": + current_chunk += data[next_i] + next_i += 1 + + is_paragraph_end = next_i < len(data) and re.match(PARAGRAPH_ENDINGS, data[next_i]) + yield (current_chunk, "paragraph_end" if is_paragraph_end else "sentence_end") + current_chunk = "" + i = next_i + continue + i += 1 - - if len(word) > 0: - yield (word, "word") + + if current_chunk: + yield (current_chunk, "word") \ No newline at end of file From ab55a73d182ff4ecee70151438ef175b52bbe6de Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 11:52:20 +0100 Subject: [PATCH 05/39] Adapt chunk_by_sentence to isomorphic chunk_by_word --- cognee/tasks/chunks/chunk_by_sentence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 5b4a40bc1..6a752caee 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -11,7 +11,7 @@ def chunk_by_sentence(data: str): word_count = 0 for (word, word_type) in chunk_by_word(data): - sentence += (" " if len(sentence) > 0 else "") + word + sentence += word word_count += 1 if word_type == "paragraph_end" or word_type == "sentence_end": From ce498d97dd0de03b399208ff4f7e3e28d1ceb873 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 12:11:56 +0100 Subject: [PATCH 06/39] Refactor chunk_by_paragraph to be isomorphic --- cognee/tasks/chunks/chunk_by_paragraph.py | 133 ++++++++++++---------- 1 file changed, 71 insertions(+), 62 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index eae5f812f..24f55b118 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -1,69 +1,78 @@ from uuid import uuid5, NAMESPACE_OID +from typing import Dict, Any, Iterator from .chunk_by_sentence import chunk_by_sentence -def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs = True): - paragraph = "" - last_cut_type = None +def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]: + """ + Chunks text by paragraph while preserving exact text reconstruction capability. + When chunks are joined with empty string "", they reproduce the original text exactly. + """ + current_chunk = "" + current_word_count = 0 + chunk_index = 0 last_paragraph_id = None - paragraph_word_count = 0 - paragraph_chunk_index = 0 - - for (paragraph_id, __, sentence, word_count, end_type) in chunk_by_sentence(data): - if paragraph_word_count > 0 and paragraph_word_count + word_count > paragraph_length: - if batch_paragraphs is True: - chunk_id = uuid5(NAMESPACE_OID, paragraph) - yield dict( - text = paragraph.strip(), - word_count = paragraph_word_count, - id = chunk_id, # When batching paragraphs, the paragraph_id is the same as chunk_id. - # paragraph_id doens't mean anything since multiple paragraphs are merged. 
- chunk_id = chunk_id, - chunk_index = paragraph_chunk_index, - cut_type = last_cut_type, - ) + last_cut_type = None + + for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data): + # Check if this sentence would exceed length limit + if current_word_count > 0 and current_word_count + word_count > paragraph_length: + # Yield current chunk + chunk_dict = { + "text": current_chunk, + "word_count": current_word_count, + "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "chunk_index": chunk_index, + "cut_type": last_cut_type + } + + if batch_paragraphs: + chunk_dict["id"] = chunk_dict["chunk_id"] else: - yield dict( - text = paragraph.strip(), - word_count = paragraph_word_count, - id = last_paragraph_id, - chunk_id = uuid5(NAMESPACE_OID, paragraph), - chunk_index = paragraph_chunk_index, - cut_type = last_cut_type, - ) - - paragraph_chunk_index += 1 - paragraph_word_count = 0 - paragraph = "" - - paragraph += (" " if len(paragraph) > 0 else "") + sentence - paragraph_word_count += word_count - - if end_type == "paragraph_end" or end_type == "sentence_cut": - if batch_paragraphs is True: - paragraph += "\n\n" if end_type == "paragraph_end" else "" - else: - yield dict( - text = paragraph.strip(), - word_count = paragraph_word_count, - paragraph_id = paragraph_id, - chunk_id = uuid5(NAMESPACE_OID, paragraph), - chunk_index = paragraph_chunk_index, - cut_type = end_type, - ) - - paragraph_chunk_index = 0 - paragraph_word_count = 0 - paragraph = "" - + chunk_dict["id"] = last_paragraph_id + + yield chunk_dict + + # Start new chunk with current sentence + current_chunk = sentence + current_word_count = word_count + chunk_index += 1 + else: + # Just concatenate directly - no space handling + current_chunk += sentence + current_word_count += word_count + + # Handle end of paragraph + if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: + # For non-batch mode, yield each paragraph separately + chunk_dict = { + "text": current_chunk, + "word_count": current_word_count, + "id": paragraph_id, + "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "chunk_index": chunk_index, + "cut_type": end_type + } + yield chunk_dict + current_chunk = "" + current_word_count = 0 + chunk_index = 0 + last_cut_type = end_type last_paragraph_id = paragraph_id - - if len(paragraph) > 0: - yield dict( - chunk_id = uuid5(NAMESPACE_OID, paragraph), - text = paragraph, - word_count = paragraph_word_count, - paragraph_id = last_paragraph_id, - chunk_index = paragraph_chunk_index, - cut_type = last_cut_type, - ) + + # Yield any remaining text + if current_chunk: + chunk_dict = { + "text": current_chunk, + "word_count": current_word_count, + "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "chunk_index": chunk_index, + "cut_type": last_cut_type + } + + if batch_paragraphs: + chunk_dict["id"] = chunk_dict["chunk_id"] + else: + chunk_dict["id"] = last_paragraph_id + + yield chunk_dict \ No newline at end of file From 92a66dddb91addbf52ae3bc58093deeca176adf1 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 12:13:12 +0100 Subject: [PATCH 07/39] Autoformat chunking tests --- .../chunks/chunk_by_paragraph_test2.py | 23 +++++---- .../chunks/chunk_by_sentence_test.py | 23 +++++---- .../processing/chunks/chunk_by_word_test.py | 47 +++++++++++-------- .../unit/processing/chunks/test_input.py | 4 +- 4 files changed, 58 insertions(+), 39 deletions(-) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py 
b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py index 2cb95f416..16767f736 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py @@ -1,17 +1,22 @@ -import pytest import numpy as np +import pytest + from cognee.tasks.chunks import chunk_by_paragraph from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS -@pytest.mark.parametrize("input_text", [ - INPUT_TEXTS["english_text"], - INPUT_TEXTS["english_lists"], - INPUT_TEXTS["python_code"], - INPUT_TEXTS["chinese_text"] -]) +@pytest.mark.parametrize( + "input_text", + [ + INPUT_TEXTS["english_text"], + INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"], + ], +) def test_chunk_by_paragraph_isomorphism(input_text): chunks = chunk_by_paragraph(input_text) reconstructed_text = "".join([chunk["text"] for chunk in chunks]) - assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" - + assert ( + reconstructed_text == input_text + ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index a21a3e9f9..fcab02e03 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -1,17 +1,22 @@ -import pytest import numpy as np +import pytest + from cognee.tasks.chunks import chunk_by_sentence from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS -@pytest.mark.parametrize("input_text", [ - INPUT_TEXTS["english_text"], - INPUT_TEXTS["english_lists"], - INPUT_TEXTS["python_code"], - INPUT_TEXTS["chinese_text"] -]) +@pytest.mark.parametrize( + "input_text", + [ + INPUT_TEXTS["english_text"], + INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"], + ], +) def test_chunk_by_sentence_isomorphism(input_text): chunks = chunk_by_sentence(input_text) reconstructed_text = "".join([chunk[2] for chunk in chunks]) - assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" - + assert ( + reconstructed_text == input_text + ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py index 38e5d9b5a..42523c106 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py @@ -1,31 +1,40 @@ -import pytest import numpy as np +import pytest + from cognee.tasks.chunks import chunk_by_word from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS -@pytest.mark.parametrize("input_text", [ - INPUT_TEXTS["english_text"], - INPUT_TEXTS["english_lists"], - INPUT_TEXTS["python_code"], - INPUT_TEXTS["chinese_text"] -]) + +@pytest.mark.parametrize( + "input_text", + [ + INPUT_TEXTS["english_text"], + INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"], + ], +) def test_chunk_by_word_isomorphism(input_text): chunks = chunk_by_word(input_text) reconstructed_text = "".join([chunk[0] for chunk in chunks]) - assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert ( + 
reconstructed_text == input_text + ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" -@pytest.mark.parametrize("input_text", [ - INPUT_TEXTS["english_text"], - INPUT_TEXTS["english_lists"], - INPUT_TEXTS["python_code"], - INPUT_TEXTS["chinese_text"] -]) + +@pytest.mark.parametrize( + "input_text", + [ + INPUT_TEXTS["english_text"], + INPUT_TEXTS["english_lists"], + INPUT_TEXTS["python_code"], + INPUT_TEXTS["chinese_text"], + ], +) def test_chunk_by_word_splits(input_text): chunks = np.array(list(chunk_by_word(input_text))) space_test = np.array([" " not in chunk[0].strip() for chunk in chunks]) - assert np.all(space_test), f"These chunks contain spaces within them: {chunks[space_test == False]}" - - - - + assert np.all( + space_test + ), f"These chunks contain spaces within them: {chunks[space_test == False]}" diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py index ad6603d9d..b7a57b75a 100644 --- a/cognee/tests/unit/processing/chunks/test_input.py +++ b/cognee/tests/unit/processing/chunks/test_input.py @@ -271,5 +271,5 @@ What feign'd submission swore: ease would recant Vows made in pain, as violent and void. For never can true reconcilement grow Where wounds of deadly hate have peirc'd so deep: -Which would but lead me to a worse relapse [ 100 ]""" -} \ No newline at end of file +Which would but lead me to a worse relapse [ 100 ]""", +} From ef7a19043d0c364585ed3f28fecabf08b3bb4cc6 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 13:33:08 +0100 Subject: [PATCH 08/39] Adapt chunk_by_paragraph test parametrization --- .../chunks/chunk_by_paragraph_test2.py | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py index 16767f736..d846fdfa2 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py @@ -6,16 +6,36 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS @pytest.mark.parametrize( - "input_text", + "input_text,paragraph_length,batch_paragraphs", [ - INPUT_TEXTS["english_text"], - INPUT_TEXTS["english_lists"], - INPUT_TEXTS["python_code"], - INPUT_TEXTS["chinese_text"], + (INPUT_TEXTS["english_text"], 64, True), + (INPUT_TEXTS["english_text"], 64, False), + (INPUT_TEXTS["english_text"], 256, True), + (INPUT_TEXTS["english_text"], 256, False), + (INPUT_TEXTS["english_text"], 1024, True), + (INPUT_TEXTS["english_text"], 1024, False), + (INPUT_TEXTS["english_lists"], 64, True), + (INPUT_TEXTS["english_lists"], 64, False), + (INPUT_TEXTS["english_lists"], 256, True), + (INPUT_TEXTS["english_lists"], 256, False), + (INPUT_TEXTS["english_lists"], 1024, True), + (INPUT_TEXTS["english_lists"], 1024, False), + (INPUT_TEXTS["python_code"], 64, True), + (INPUT_TEXTS["python_code"], 64, False), + (INPUT_TEXTS["python_code"], 256, True), + (INPUT_TEXTS["python_code"], 256, False), + (INPUT_TEXTS["python_code"], 1024, True), + (INPUT_TEXTS["python_code"], 1024, False), + (INPUT_TEXTS["chinese_text"], 64, True), + (INPUT_TEXTS["chinese_text"], 64, False), + (INPUT_TEXTS["chinese_text"], 256, True), + (INPUT_TEXTS["chinese_text"], 256, False), + (INPUT_TEXTS["chinese_text"], 1024, True), + (INPUT_TEXTS["chinese_text"], 1024, False), ], ) -def test_chunk_by_paragraph_isomorphism(input_text): - chunks = 
chunk_by_paragraph(input_text) +def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): + chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) reconstructed_text = "".join([chunk["text"] for chunk in chunks]) assert ( reconstructed_text == input_text From f8e5b529c3825481e4f056994448ead8a780d4fa Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 13:58:00 +0100 Subject: [PATCH 09/39] Add maximum_length argument to chunk_sentences --- cognee/tasks/chunks/chunk_by_paragraph.py | 3 +- cognee/tasks/chunks/chunk_by_sentence.py | 5 ++- .../chunks/chunk_by_paragraph_test2.py | 44 ++++++++++++++++++- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 24f55b118..11ab8dd41 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -13,7 +13,8 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs last_paragraph_id = None last_cut_type = None - for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data): + for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): + assert word_count <= paragraph_length, f"{paragraph_length = } is smaller than {word_count = }" # Check if this sentence would exceed length limit if current_word_count > 0 and current_word_count + word_count > paragraph_length: # Yield current chunk diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 6a752caee..7191a78c4 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -2,9 +2,10 @@ from uuid import uuid4 +from typing import Optional from .chunk_by_word import chunk_by_word -def chunk_by_sentence(data: str): +def chunk_by_sentence(data: str, maximum_length: Optional[int]): sentence = "" paragraph_id = uuid4() chunk_index = 0 @@ -14,7 +15,7 @@ def chunk_by_sentence(data: str): sentence += word word_count += 1 - if word_type == "paragraph_end" or word_type == "sentence_end": + if word_type == "paragraph_end" or word_type == "sentence_end" or ((word_count is not None) and (word_count == maximum_length)): yield (paragraph_id, chunk_index, sentence, word_count, word_type) sentence = "" word_count = 0 diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py index d846fdfa2..ef75094c4 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from cognee.tasks.chunks import chunk_by_paragraph +from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS @@ -40,3 +40,45 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para assert ( reconstructed_text == input_text ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + + +@pytest.mark.parametrize( + "input_text,paragraph_length,batch_paragraphs", + [ + (INPUT_TEXTS["english_text"], 64, True), + (INPUT_TEXTS["english_text"], 64, False), + (INPUT_TEXTS["english_text"], 256, True), + (INPUT_TEXTS["english_text"], 256, False), + (INPUT_TEXTS["english_text"], 1024, True), + (INPUT_TEXTS["english_text"], 1024, False), + 
(INPUT_TEXTS["english_lists"], 64, True), + (INPUT_TEXTS["english_lists"], 64, False), + (INPUT_TEXTS["english_lists"], 256, True), + (INPUT_TEXTS["english_lists"], 256, False), + (INPUT_TEXTS["english_lists"], 1024, True), + (INPUT_TEXTS["english_lists"], 1024, False), + (INPUT_TEXTS["python_code"], 64, True), + (INPUT_TEXTS["python_code"], 64, False), + (INPUT_TEXTS["python_code"], 256, True), + (INPUT_TEXTS["python_code"], 256, False), + (INPUT_TEXTS["python_code"], 1024, True), + (INPUT_TEXTS["python_code"], 1024, False), + (INPUT_TEXTS["chinese_text"], 64, True), + (INPUT_TEXTS["chinese_text"], 64, False), + (INPUT_TEXTS["chinese_text"], 256, True), + (INPUT_TEXTS["chinese_text"], 256, False), + (INPUT_TEXTS["chinese_text"], 1024, True), + (INPUT_TEXTS["chinese_text"], 1024, False), + ], +) +def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): + chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)) + + chunk_lengths = np.array( + [len(list(chunk_by_word(chunk["text"]))) for chunk in chunks] + ) + + larger_chunks = chunk_lengths[chunk_lengths > paragraph_length] + assert np.all( + chunk_lengths <= paragraph_length + ), f"{paragraph_length = }: {larger_chunks} are too large" From 1b4a7e4fdcdd83305fdb1cf2398ec0a195efe3ed Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 14:17:00 +0100 Subject: [PATCH 10/39] Adapt chunk_by_paragraph_test.py --- .../chunks/chunk_by_paragraph_test.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index 24c3cc147..28b4b37c3 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -3,13 +3,13 @@ from cognee.tasks.chunks import chunk_by_paragraph GROUND_TRUTH = { "whole_text": [ { - "text": "This is example text. It contains multiple sentences.", - "word_count": 8, + "text": "This is example text. It contains multiple sentences.\n", + "word_count": 9, "cut_type": "paragraph_end", }, { - "text": "This is a second paragraph. First two paragraphs are whole.", - "word_count": 10, + "text": "This is a second paragraph. First two paragraphs are whole.\n", + "word_count": 11, "cut_type": "paragraph_end", }, { @@ -20,30 +20,30 @@ GROUND_TRUTH = { ], "cut_text": [ { - "text": "This is example text. It contains multiple sentences.", - "word_count": 8, + "text": "This is example text. It contains multiple sentences.\n", + "word_count": 9, "cut_type": "paragraph_end", }, { - "text": "This is a second paragraph. First two paragraphs are whole.", - "word_count": 10, + "text": "This is a second paragraph. First two paragraphs are whole.\n", + "word_count": 11, "cut_type": "paragraph_end", }, { "text": "Third paragraph is cut and is missing the dot at the end", "word_count": 12, - "cut_type": "sentence_cut", + "cut_type": "word", }, ], } INPUT_TEXT = { "whole_text": """This is example text. It contains multiple sentences. - This is a second paragraph. First two paragraphs are whole. - Third paragraph is a bit longer and is finished with a dot.""", +This is a second paragraph. First two paragraphs are whole. +Third paragraph is a bit longer and is finished with a dot.""", "cut_text": """This is example text. It contains multiple sentences. - This is a second paragraph. First two paragraphs are whole. 
- Third paragraph is cut and is missing the dot at the end""", +This is a second paragraph. First two paragraphs are whole. +Third paragraph is cut and is missing the dot at the end""", }
From 9b2fb09c5920641a40305154b1a1acb833021107 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 15:39:17 +0100 Subject: [PATCH 11/39] Fix PdfDocument test, give chunk_by_sentence a maximum_length arg --- cognee/tasks/chunks/chunk_by_sentence.py | 2 +- cognee/tests/unit/documents/PdfDocument_test.py | 4 ++-- ...unk_by_paragraph_test2.py => chunk_by_paragraph_2_test.py} | 0 .../tests/unit/processing/chunks/chunk_by_paragraph_test.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename cognee/tests/unit/processing/chunks/{chunk_by_paragraph_test2.py => chunk_by_paragraph_2_test.py} (100%)
diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 7191a78c4..1ce052a6c 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -5,7 +5,7 @@ from uuid import uuid4 from typing import Optional from .chunk_by_word import chunk_by_word -def chunk_by_sentence(data: str, maximum_length: Optional[int]): +def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): sentence = "" paragraph_id = uuid4() chunk_index = 0
diff --git a/cognee/tests/unit/documents/PdfDocument_test.py b/cognee/tests/unit/documents/PdfDocument_test.py index 917e9c3e0..108d61273 100644 --- a/cognee/tests/unit/documents/PdfDocument_test.py +++ b/cognee/tests/unit/documents/PdfDocument_test.py @@ -4,8 +4,8 @@ import uuid from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument GROUND_TRUTH = [ - {"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"}, - {"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"}, + {"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"}, + {"word_count": 953, "len_text": 6363, "cut_type": "sentence_end"}, ]
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py similarity index 100% rename from cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py rename to cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index 28b4b37c3..f8fe00237 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -49,7 +49,7 @@ Third paragraph is cut and is missing the dot at the end""", def run_chunking_test(test_text, expected_chunks): chunks = [] - for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False): + for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=True): chunks.append(chunk_data) assert len(chunks) == 3
From 92a66dddb91addbf52ae3bc58093deeca176adf1 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 15:53:44 +0100 Subject: [PATCH 12/39] Replace word_count with maximum_length in if clause --- cognee/tasks/chunks/chunk_by_sentence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 1ce052a6c..2bf1bf8ee 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -15,7 +15,7 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): sentence += word word_count += 1 - if word_type == "paragraph_end" or word_type == "sentence_end" or ((word_count is not None) and (word_count == maximum_length)): + if word_type == "paragraph_end" or word_type == "sentence_end" or ((maximum_length is not None) and (word_count == maximum_length)): yield (paragraph_id, chunk_index, sentence, word_count, word_type) sentence = "" word_count = 0
From fdec9a692e0b7adb9bbcc39f65251e5620571161 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 16:03:06 +0100 Subject: [PATCH 13/39] Test maximum_length parameter of chunk_by_sentence --- .../chunks/chunk_by_sentence_test.py | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-)
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index fcab02e03..45af2ed39 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -1,22 +1,54 @@ import numpy as np import pytest -from cognee.tasks.chunks import chunk_by_sentence +from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS @pytest.mark.parametrize( - "input_text", + "input_text,maximum_length", [ - INPUT_TEXTS["english_text"], - INPUT_TEXTS["english_lists"], - INPUT_TEXTS["python_code"], - INPUT_TEXTS["chinese_text"], + (INPUT_TEXTS["english_text"], None), + (INPUT_TEXTS["english_text"], 8), + (INPUT_TEXTS["english_text"], 64), + (INPUT_TEXTS["english_lists"], None), + (INPUT_TEXTS["english_lists"], 8), + (INPUT_TEXTS["english_lists"], 64), + (INPUT_TEXTS["python_code"], None), + (INPUT_TEXTS["python_code"], 8), + (INPUT_TEXTS["python_code"], 64), + (INPUT_TEXTS["chinese_text"], None), + (INPUT_TEXTS["chinese_text"], 8), + (INPUT_TEXTS["chinese_text"], 64), ], ) -def test_chunk_by_sentence_isomorphism(input_text): - chunks = chunk_by_sentence(input_text) +def test_chunk_by_sentence_isomorphism(input_text, maximum_length): + chunks = chunk_by_sentence(input_text, maximum_length) reconstructed_text = "".join([chunk[2] for chunk in chunks]) assert ( reconstructed_text == input_text ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + + +@pytest.mark.parametrize( + "input_text,maximum_length", + [ + (INPUT_TEXTS["english_text"], 8), + (INPUT_TEXTS["english_text"], 64), + (INPUT_TEXTS["english_lists"], 8), + (INPUT_TEXTS["english_lists"], 64), + (INPUT_TEXTS["python_code"], 8), + (INPUT_TEXTS["python_code"], 64), + (INPUT_TEXTS["chinese_text"], 8), + (INPUT_TEXTS["chinese_text"], 64), + ], +) +def test_paragraph_chunk_length(input_text, maximum_length): + chunks = list(chunk_by_sentence(input_text, maximum_length)) + + chunk_lengths = np.array([len(list(chunk_by_word(chunk[2]))) for chunk in chunks]) + + larger_chunks = chunk_lengths[chunk_lengths > maximum_length] + assert np.all( + chunk_lengths <= maximum_length + ), f"{maximum_length = }: {larger_chunks} are too large"
From b787407db7ab9c2c114098f61c427f8ec5f1e29b Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 16:23:14 +0100 Subject: [PATCH 14/39] Add more adversarial examples --- cognee/tasks/chunks/chunk_by_sentence.py | 2 +- .../chunks/chunk_by_paragraph_2_test.py | 59 +++---------------- .../chunks/chunk_by_sentence_test.py | 35 ++++------- .../unit/processing/chunks/test_input.py | 8 +++ 4 files changed, 27 insertions(+), 77
deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 2bf1bf8ee..9159922af 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -15,7 +15,7 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): sentence += word word_count += 1 - if word_type == "paragraph_end" or word_type == "sentence_end" or ((maximum_length is not None) and (word_count == maximum_length)): + if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)): yield (paragraph_id, chunk_index, sentence, word_count, word_type) sentence = "" word_count = 0 diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py index ef75094c4..ad09c9671 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py @@ -1,38 +1,18 @@ +from itertools import product + import numpy as np import pytest from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS +paragraph_lengths = [64, 256, 1024] +batch_paragraphs_vals = [True, False] + @pytest.mark.parametrize( "input_text,paragraph_length,batch_paragraphs", - [ - (INPUT_TEXTS["english_text"], 64, True), - (INPUT_TEXTS["english_text"], 64, False), - (INPUT_TEXTS["english_text"], 256, True), - (INPUT_TEXTS["english_text"], 256, False), - (INPUT_TEXTS["english_text"], 1024, True), - (INPUT_TEXTS["english_text"], 1024, False), - (INPUT_TEXTS["english_lists"], 64, True), - (INPUT_TEXTS["english_lists"], 64, False), - (INPUT_TEXTS["english_lists"], 256, True), - (INPUT_TEXTS["english_lists"], 256, False), - (INPUT_TEXTS["english_lists"], 1024, True), - (INPUT_TEXTS["english_lists"], 1024, False), - (INPUT_TEXTS["python_code"], 64, True), - (INPUT_TEXTS["python_code"], 64, False), - (INPUT_TEXTS["python_code"], 256, True), - (INPUT_TEXTS["python_code"], 256, False), - (INPUT_TEXTS["python_code"], 1024, True), - (INPUT_TEXTS["python_code"], 1024, False), - (INPUT_TEXTS["chinese_text"], 64, True), - (INPUT_TEXTS["chinese_text"], 64, False), - (INPUT_TEXTS["chinese_text"], 256, True), - (INPUT_TEXTS["chinese_text"], 256, False), - (INPUT_TEXTS["chinese_text"], 1024, True), - (INPUT_TEXTS["chinese_text"], 1024, False), - ], + list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)), ) def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) @@ -44,32 +24,7 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para @pytest.mark.parametrize( "input_text,paragraph_length,batch_paragraphs", - [ - (INPUT_TEXTS["english_text"], 64, True), - (INPUT_TEXTS["english_text"], 64, False), - (INPUT_TEXTS["english_text"], 256, True), - (INPUT_TEXTS["english_text"], 256, False), - (INPUT_TEXTS["english_text"], 1024, True), - (INPUT_TEXTS["english_text"], 1024, False), - (INPUT_TEXTS["english_lists"], 64, True), - (INPUT_TEXTS["english_lists"], 64, False), - (INPUT_TEXTS["english_lists"], 256, True), - (INPUT_TEXTS["english_lists"], 256, False), - (INPUT_TEXTS["english_lists"], 1024, True), - (INPUT_TEXTS["english_lists"], 1024, False), - (INPUT_TEXTS["python_code"], 64, True), - (INPUT_TEXTS["python_code"], 64, False), - 
(INPUT_TEXTS["python_code"], 256, True), - (INPUT_TEXTS["python_code"], 256, False), - (INPUT_TEXTS["python_code"], 1024, True), - (INPUT_TEXTS["python_code"], 1024, False), - (INPUT_TEXTS["chinese_text"], 64, True), - (INPUT_TEXTS["chinese_text"], 64, False), - (INPUT_TEXTS["chinese_text"], 256, True), - (INPUT_TEXTS["chinese_text"], 256, False), - (INPUT_TEXTS["chinese_text"], 1024, True), - (INPUT_TEXTS["chinese_text"], 1024, False), - ], + list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)), ) def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index 45af2ed39..2f42f836a 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -1,26 +1,17 @@ +from itertools import product + import numpy as np import pytest from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS +maximum_length_vals = [None, 8, 64] + @pytest.mark.parametrize( "input_text,maximum_length", - [ - (INPUT_TEXTS["english_text"], None), - (INPUT_TEXTS["english_text"], 8), - (INPUT_TEXTS["english_text"], 64), - (INPUT_TEXTS["english_lists"], None), - (INPUT_TEXTS["english_lists"], 8), - (INPUT_TEXTS["english_lists"], 64), - (INPUT_TEXTS["python_code"], None), - (INPUT_TEXTS["python_code"], 8), - (INPUT_TEXTS["python_code"], 64), - (INPUT_TEXTS["chinese_text"], None), - (INPUT_TEXTS["chinese_text"], 8), - (INPUT_TEXTS["chinese_text"], 64), - ], + list(product(list(INPUT_TEXTS.values()), maximum_length_vals)), ) def test_chunk_by_sentence_isomorphism(input_text, maximum_length): chunks = chunk_by_sentence(input_text, maximum_length) @@ -32,16 +23,12 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length): @pytest.mark.parametrize( "input_text,maximum_length", - [ - (INPUT_TEXTS["english_text"], 8), - (INPUT_TEXTS["english_text"], 64), - (INPUT_TEXTS["english_lists"], 8), - (INPUT_TEXTS["english_lists"], 64), - (INPUT_TEXTS["python_code"], 8), - (INPUT_TEXTS["python_code"], 64), - (INPUT_TEXTS["chinese_text"], 8), - (INPUT_TEXTS["chinese_text"], 64), - ], + list( + product( + list(INPUT_TEXTS.values()), + [val for val in maximum_length_vals if val is not None], + ) + ), ) def test_paragraph_chunk_length(input_text, maximum_length): chunks = list(chunk_by_sentence(input_text, maximum_length)) diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py index b7a57b75a..820bf2d2d 100644 --- a/cognee/tests/unit/processing/chunks/test_input.py +++ b/cognee/tests/unit/processing/chunks/test_input.py @@ -272,4 +272,12 @@ Vows made in pain, as violent and void. For never can true reconcilement grow Where wounds of deadly hate have peirc'd so deep: Which would but lead me to a worse relapse [ 100 ]""", + "empty": "", + "single_char": "x", + "whitespace": " \n\t \r\n ", + "unicode_special": "Hello 👋 مرحبا שָׁלוֹם", + "mixed_endings": "line1\r\nline2\nline3\r\nline4", + "many_newlines": "\n\n\n\ntext\n\n\n\n", + "html_mixed": "
<div>Hello</div>\nPlain text\n<div>World</div>
", + "urls_emails": "Visit https://example.com or email user@example.com", } From 45a60b7f1963b9675a4d7812a6e33311f469d519 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Wed, 13 Nov 2024 16:35:47 +0100 Subject: [PATCH 15/39] Remove assert and move is_real_paragraph_end outside loop --- cognee/tasks/chunks/chunk_by_paragraph.py | 1 - cognee/tasks/chunks/chunk_by_word.py | 26 +++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 11ab8dd41..f960eb028 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -14,7 +14,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs last_cut_type = None for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): - assert word_count <= paragraph_length, f"{paragraph_length = } is smaller than {word_count = }" # Check if this sentence would exceed length limit if current_word_count > 0 and current_word_count + word_count > paragraph_length: # Yield current chunk diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index 7ebf4bced..120c759e6 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -3,6 +3,18 @@ import re SENTENCE_ENDINGS = r"[.;!?…]" PARAGRAPH_ENDINGS = r"[\n\r]" +def is_real_paragraph_end(last_processed_character, i, data): + if re.match(SENTENCE_ENDINGS, last_processed_character): + return True + j = i + 1 + next_character = data[j] if j < len(data) else None + while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "): + j += 1 + next_character = data[j] if j < len(data) else None + if next_character and next_character.isupper(): + return True + return False + def chunk_by_word(data: str): """ Chunks text into words and endings while preserving whitespace. 
@@ -24,23 +36,11 @@ def chunk_by_word(data: str): while i < len(data): character = data[i] - def is_real_paragraph_end(): - if re.match(SENTENCE_ENDINGS, last_processed_character): - return True - j = i + 1 - next_character = data[j] if j < len(data) else None - while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "): - j += 1 - next_character = data[j] if j < len(data) else None - if next_character and next_character.isupper(): - return True - return False - if re.match(PARAGRAPH_ENDINGS, character): if current_chunk: yield (current_chunk, "word") current_chunk = "" - yield (character, "paragraph_end" if is_real_paragraph_end() else "word") + yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word") i += 1 continue From d90698305bb3d1884577749a7408d7376e6149b3 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 09:43:10 +0100 Subject: [PATCH 16/39] Simplify chunk_by_word --- cognee/tasks/chunks/chunk_by_word.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index 120c759e6..a93f9acdb 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -25,14 +25,6 @@ def chunk_by_word(data: str): current_chunk = "" i = 0 - # Handle leading whitespace if any - while i < len(data) and (re.match(PARAGRAPH_ENDINGS, data[i]) or data[i] == " "): - current_chunk += data[i] - i += 1 - if current_chunk: - yield (current_chunk, "word") - current_chunk = "" - while i < len(data): character = data[i] @@ -53,12 +45,7 @@ def chunk_by_word(data: str): i += 1 continue - if re.match(SENTENCE_ENDINGS, character): - # Check for ellipses - if i + 2 < len(data) and data[i:i+3] == "...": - current_chunk += ".." - i += 2 - + if re.match(SENTENCE_ENDINGS, character): # Look ahead for whitespace next_i = i + 1 while next_i < len(data) and data[next_i] == " ": From adc8a0b09cc61427e01e3cec17bc58b0d25854c1 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 09:43:32 +0100 Subject: [PATCH 17/39] Add ellipsis test string --- cognee/tests/unit/processing/chunks/test_input.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py index 820bf2d2d..4afbab70d 100644 --- a/cognee/tests/unit/processing/chunks/test_input.py +++ b/cognee/tests/unit/processing/chunks/test_input.py @@ -280,4 +280,5 @@ Which would but lead me to a worse relapse [ 100 ]""", "many_newlines": "\n\n\n\ntext\n\n\n\n", "html_mixed": "
<div>
Hello
</div>
\nPlain text\n
<p>World</p>
</div>
", "urls_emails": "Visit https://example.com or email user@example.com", + "elipses": "Hello...How are you…", } From e794bb88346d4ad6a2c16752026b18d053b2993d Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 09:43:56 +0100 Subject: [PATCH 18/39] Return stripped value from get_embeddable_data if its string --- cognee/infrastructure/engine/models/DataPoint.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 222b11ad7..337306cb6 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -20,5 +20,8 @@ class DataPoint(BaseModel): def get_embeddable_data(self): if self._metadata and len(self._metadata["index_fields"]) > 0 \ and hasattr(self, self._metadata["index_fields"][0]): - - return getattr(self, self._metadata["index_fields"][0]) + attribute = getattr(self, self._metadata["index_fields"][0]) + if isinstance(attribute, str): + return(attribute.strip()) + else: + return (attribute) From 8afb25e0d4cdd41d159fd398212e69c1dc48d918 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 11:24:11 +0100 Subject: [PATCH 19/39] Move PdfDocument_test.py to integration tests --- .DS_Store | Bin 6148 -> 0 bytes .../documents/PdfDocument_test.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store rename cognee/tests/{unit => integration}/documents/PdfDocument_test.py (100%) diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index c53aac86a09d129220b9f904fda3e88a43eb0059..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKISv9b477m)L_?wu z8Wo@dRDcRl0V?oC1+usf$6q{?M^OPP@E;1;{ZQbBHL(r!s{@0#0KfslZkT&70W1~( z*2Fdt5ts%Q7*x#`LxYZZ$-J7_1_oU;n-9%5Yj!B=Z^!w?(?x3_M=C%CUKQxaa$@y= z3IEXlzmmA30#x9y6wtw9wV30RvbJ_U&T4Ieui=(+hnr#U6bxRDfnJWWuyQ>0q{u5a Y$9_$01D%ez(}DaMFkNU=;MWQ~01mYkvj6}9 diff --git a/cognee/tests/unit/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py similarity index 100% rename from cognee/tests/unit/documents/PdfDocument_test.py rename to cognee/tests/integration/documents/PdfDocument_test.py From e6636754ff91e0933f16f47d6c2af6cc7c493467 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 11:39:14 +0100 Subject: [PATCH 20/39] Add TextDocument_test.py --- .../integration/documents/PdfDocument_test.py | 4 +- .../documents/TextDocument_test.py | 46 +++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 cognee/tests/integration/documents/TextDocument_test.py diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py index 108d61273..d8ddbe23c 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -16,12 +16,12 @@ def test_PdfDocument(): "test_data", "artificial-intelligence.pdf", ) - pdf_doc = PdfDocument( + document = PdfDocument( id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path ) for ground_truth, paragraph_data in zip( - GROUND_TRUTH, pdf_doc.read(chunk_size=1024) + GROUND_TRUTH, document.read(chunk_size=1024) ): assert ( ground_truth["word_count"] == paragraph_data.word_count diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py new file mode 100644 index 000000000..1547baa46 
--- /dev/null +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -0,0 +1,46 @@ +import os +import uuid + +import pytest + +from cognee.modules.data.processing.document_types.TextDocument import TextDocument + +GROUND_TRUTH = { + "code.txt": [ + {"word_count": 253, "len_text": 953, "cut_type": "paragraph_end"}, + {"word_count": 157, "len_text": 905, "cut_type": "paragraph_end"}, + ], + "Natural_language_processing.txt": [ + {"word_count": 115, "len_text": 839, "cut_type": "paragraph_end"}, + {"word_count": 15, "len_text": 146, "cut_type": "paragraph_end"}, + ], +} + + +@pytest.mark.parametrize( + "input_file,chunk_size", + [("code.txt", 256), ("Natural_language_processing.txt", 128)], +) +def test_TextDocument(input_file, chunk_size): + test_file_path = os.path.join( + os.sep, + *(os.path.dirname(__file__).split(os.sep)[:-2]), + "test_data", + input_file, + ) + document = TextDocument( + id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path + ) + + for ground_truth, paragraph_data in zip( + GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size) + ): + assert ( + ground_truth["word_count"] == paragraph_data.word_count + ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + assert ground_truth["len_text"] == len( + paragraph_data.text + ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + assert ( + ground_truth["cut_type"] == paragraph_data.cut_type + ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' From c905510f305432a04eac32474753fca2d992396f Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 11:44:18 +0100 Subject: [PATCH 21/39] Change test_input order --- .../tests/unit/processing/chunks/test_input.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py index 4afbab70d..21a0e3165 100644 --- a/cognee/tests/unit/processing/chunks/test_input.py +++ b/cognee/tests/unit/processing/chunks/test_input.py @@ -1,6 +1,15 @@ import pytest INPUT_TEXTS = { + "empty": "", + "single_char": "x", + "whitespace": " \n\t \r\n ", + "unicode_special": "Hello 👋 مرحبا שָׁלוֹם", + "mixed_endings": "line1\r\nline2\nline3\r\nline4", + "many_newlines": "\n\n\n\ntext\n\n\n\n", + "html_mixed": "
+<div>
+Hello
+</div>
+\nPlain text\n
+<p>World</p>
+</div>
", + "urls_emails": "Visit https://example.com or email user@example.com", + "elipses": "Hello...How are you…", "english_lists": """Let me think through the key attributes that would be important to test in a text chunking system. Here are the essential attributes to test: @@ -272,13 +281,4 @@ Vows made in pain, as violent and void. For never can true reconcilement grow Where wounds of deadly hate have peirc'd so deep: Which would but lead me to a worse relapse [ 100 ]""", - "empty": "", - "single_char": "x", - "whitespace": " \n\t \r\n ", - "unicode_special": "Hello 👋 مرحبا שָׁלוֹם", - "mixed_endings": "line1\r\nline2\nline3\r\nline4", - "many_newlines": "\n\n\n\ntext\n\n\n\n", - "html_mixed": "
-<div>
-Hello
-</div>
-\nPlain text\n
-<p>World</p>
-</div>
", - "urls_emails": "Visit https://example.com or email user@example.com", - "elipses": "Hello...How are you…", } From 8b3b2f8156844ea3a10e729cf202ec24fb9b6f45 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 11:59:46 +0100 Subject: [PATCH 22/39] Add transcribe_image and create_transcript methods --- .../data/processing/document_types/AudioDocument.py | 8 ++++++-- .../data/processing/document_types/ImageDocument.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index d3ae0974d..989c881a1 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -5,10 +5,14 @@ from .Document import Document class AudioDocument(Document): type: str = "audio" + def create_transcript(self): + result = get_llm_client().create_transcript(self.raw_data_location) + return(result.text) + def read(self, chunk_size: int): # Transcribe the audio file - result = get_llm_client().create_transcript(self.raw_data_location) - text = result.text + + text = self.create_transcript() chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text) diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 5571b3bd8..7338217a5 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -5,10 +5,14 @@ from .Document import Document class ImageDocument(Document): type: str = "image" + + def transcribe_image(self): + result = get_llm_client().transcribe_image(self.raw_data_location) + return(result.choices[0].message.content) + def read(self, chunk_size: int): # Transcribe the image file - result = get_llm_client().transcribe_image(self.raw_data_location) - text = result.choices[0].message.content + text = self.transcribe_image() chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text) From f87fd12e9b448fb3c009a307406ebe9213322967 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 12:41:47 +0100 Subject: [PATCH 23/39] Fix lambda bug in AudioDocument and ImageDocument --- cognee/modules/data/processing/document_types/AudioDocument.py | 2 +- cognee/modules/data/processing/document_types/ImageDocument.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index 989c881a1..0d2cddd3d 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -14,6 +14,6 @@ class AudioDocument(Document): text = self.create_transcript() - chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text]) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 7338217a5..e8f0dd8ee 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -14,6 +14,6 @@ class ImageDocument(Document): # Transcribe the image file text = self.transcribe_image() - 
chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text]) yield from chunker.read() From 82606474971f6b776f157c7d55ae8ae00d677272 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 12:42:10 +0100 Subject: [PATCH 24/39] Add AudioDocument and ImageDocument tests --- .../documents/AudioDocument_test.py | 46 +++++++++++++++++++ .../documents/ImageDocument_test.py | 34 ++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 cognee/tests/integration/documents/AudioDocument_test.py create mode 100644 cognee/tests/integration/documents/ImageDocument_test.py diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py new file mode 100644 index 000000000..b20124456 --- /dev/null +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -0,0 +1,46 @@ +import uuid +from unittest.mock import patch + +from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument + +GROUND_TRUTH = [ + {"word_count": 60, "len_text": 318, "cut_type": "sentence_end"}, + {"word_count": 64, "len_text": 358, "cut_type": "sentence_end"}, + {"word_count": 56, "len_text": 255, "cut_type": "sentence_cut"}, +] + +TEST_TEXT = """ +"Mike, we need to talk about the payment processing service." +"Good timing. The board wants one-click checkout by end of quarter." +"That's exactly the problem. The service is held together with duct tape. One wrong move and—" +"Sarah, we've been over this. The market won't wait." +"And neither will a system collapse! The technical debt is crushing us. Every new feature takes twice as long as it should." +"Then work twice as hard. Our competitors—" +"Our competitors will laugh when our whole system goes down during Black Friday! We're talking about financial transactions here, not some blog comments section." +"Write up your concerns in a doc. Right now, we ship one-click." +"Then you'll ship it without me. I won't stake my reputation on a house of cards." +"Are you threatening to quit?" +"No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this." +"The feature ships, Sarah. That's final." 
+""" + + +def test_AudioDocument(): + + document = AudioDocument( + id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="" + ) + with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): + + for ground_truth, paragraph_data in zip( + GROUND_TRUTH, document.read(chunk_size=64) + ): + assert ( + ground_truth["word_count"] == paragraph_data.word_count + ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + assert ground_truth["len_text"] == len( + paragraph_data.text + ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + assert ( + ground_truth["cut_type"] == paragraph_data.cut_type + ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py new file mode 100644 index 000000000..d34127eb3 --- /dev/null +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -0,0 +1,34 @@ +import uuid +from unittest.mock import patch + +from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument + +GROUND_TRUTH = [ + {"word_count": 51, "len_text": 298, "cut_type": "sentence_end"}, + {"word_count": 63, "len_text": 369, "cut_type": "sentence_end"}, + {"word_count": 44, "len_text": 294, "cut_type": "sentence_end"}, +] + +TEST_TEXT = """A dramatic confrontation unfolds as a red fox and river otter engage in an energetic wrestling match at the water's edge. The fox, teeth bared in a playful snarl, has its front paws locked with the otter's flippers as they roll through the shallow stream, sending water spraying in all directions. The otter, displaying its surprising agility on land, counters by twisting its sleek body and attempting to wrap itself around the fox's shoulders, its whiskered face inches from the fox's muzzle. +The commotion has attracted an audience: a murder of crows has gathered in the low branches, their harsh calls adding to the chaos as they hop excitedly from limb to limb. 
One particularly bold crow dive-bombs the wrestling pair, causing both animals to momentarily freeze mid-tussle, creating a perfect snapshot of suspended action—the fox's fur dripping wet, the otter's body coiled like a spring, and the crow's wings spread wide against the golden morning light.""" + + +def test_ImageDocument(): + + document = ImageDocument( + id=uuid.uuid4(), name="image-dummy-test", raw_data_location="" + ) + with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT): + + for ground_truth, paragraph_data in zip( + GROUND_TRUTH, document.read(chunk_size=64) + ): + assert ( + ground_truth["word_count"] == paragraph_data.word_count + ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + assert ground_truth["len_text"] == len( + paragraph_data.text + ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + assert ( + ground_truth["cut_type"] == paragraph_data.cut_type + ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' From f2206a09c0c5a3b87fd772fbffc2f9fc1dcf7dda Mon Sep 17 00:00:00 2001 From: 0xideas Date: Thu, 14 Nov 2024 13:16:17 +0100 Subject: [PATCH 25/39] Update cognee/tasks/chunks/chunk_by_word.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- cognee/tasks/chunks/chunk_by_word.py | 31 +++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index a93f9acdb..45f11f1c2 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -3,15 +3,32 @@ import re SENTENCE_ENDINGS = r"[.;!?…]" PARAGRAPH_ENDINGS = r"[\n\r]" -def is_real_paragraph_end(last_processed_character, i, data): - if re.match(SENTENCE_ENDINGS, last_processed_character): +def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool: + """ + Determines if the current position represents a real paragraph ending. 
+ + Args: + last_char: The last processed character + current_pos: Current position in the text + text: The input text + + Returns: + bool: True if this is a real paragraph end, False otherwise + """ + if re.match(SENTENCE_ENDINGS, last_char): return True - j = i + 1 - next_character = data[j] if j < len(data) else None - while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "): + j = current_pos + 1 + if j >= len(text): + return False + + next_character = text[j] + while j < len(text) and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "): j += 1 - next_character = data[j] if j < len(data) else None - if next_character and next_character.isupper(): + if j >= len(text): + return False + next_character = text[j] + + if next_character.isupper(): return True return False From 6721eaee83aac51594cee2202d1c3a8957b5463d Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 13:50:40 +0100 Subject: [PATCH 26/39] Fix chunk_index bug in chunk_by_paragraph --- cognee/tasks/chunks/chunk_by_paragraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index f960eb028..276da0bf1 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -53,9 +53,9 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs "cut_type": end_type } yield chunk_dict + chunk_index += 1 current_chunk = "" current_word_count = 0 - chunk_index = 0 last_cut_type = end_type last_paragraph_id = paragraph_id From 57d8149732fd4339f578b2487fe9b5880ec83e33 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 13:59:54 +0100 Subject: [PATCH 27/39] Save paragraph_ids in chunk_by_paragraph --- cognee/tasks/chunks/chunk_by_paragraph.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 276da0bf1..a85a2de26 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -10,29 +10,28 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs current_chunk = "" current_word_count = 0 chunk_index = 0 - last_paragraph_id = None + paragraph_ids = [] last_cut_type = None for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit + paragraph_ids.append(paragraph_id) + if current_word_count > 0 and current_word_count + word_count > paragraph_length: # Yield current chunk chunk_dict = { "text": current_chunk, "word_count": current_word_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, - "cut_type": last_cut_type + "cut_type": last_cut_type, } - - if batch_paragraphs: - chunk_dict["id"] = chunk_dict["chunk_id"] - else: - chunk_dict["id"] = last_paragraph_id yield chunk_dict # Start new chunk with current sentence + paragraph_ids = [] current_chunk = sentence current_word_count = word_count chunk_index += 1 @@ -47,15 +46,16 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_dict = { "text": current_chunk, "word_count": current_word_count, - "id": paragraph_id, + "paragraph_ids": paragraph_ids, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "chunk_index": chunk_index, 
"cut_type": end_type } yield chunk_dict - chunk_index += 1 + paragraph_ids = [] current_chunk = "" current_word_count = 0 + chunk_index += 1 last_cut_type = end_type last_paragraph_id = paragraph_id @@ -66,13 +66,10 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs "text": current_chunk, "word_count": current_word_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "paragrapg_ids": paragraph_ids, "chunk_index": chunk_index, "cut_type": last_cut_type } - if batch_paragraphs: - chunk_dict["id"] = chunk_dict["chunk_id"] - else: - chunk_dict["id"] = last_paragraph_id yield chunk_dict \ No newline at end of file From eaf9167fa170113ca6173a518d679ef5536647c4 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 14:19:34 +0100 Subject: [PATCH 28/39] Change chunk_by_word to collect newlines in prior words --- cognee/tasks/chunks/chunk_by_word.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index 45f11f1c2..ab4d8343e 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -38,23 +38,13 @@ def chunk_by_word(data: str): Whitespace is included with the preceding word. Outputs can be joined with "" to recreate the original input. """ - last_processed_character = "" current_chunk = "" i = 0 while i < len(data): character = data[i] - if re.match(PARAGRAPH_ENDINGS, character): - if current_chunk: - yield (current_chunk, "word") - current_chunk = "" - yield (character, "paragraph_end" if is_real_paragraph_end(last_processed_character, i, data) else "word") - i += 1 - continue - current_chunk += character - last_processed_character = character if character == " ": yield (current_chunk, "word") From a52d3ac6ba23835611575ac765f4b5757087625a Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 14:20:18 +0100 Subject: [PATCH 29/39] Change document test ground truth values for new chunk_by_word --- .../documents/AudioDocument_test.py | 6 +++--- .../documents/ImageDocument_test.py | 2 +- .../documents/TextDocument_test.py | 8 ++++---- .../chunks/chunk_by_paragraph_test.py | 20 +++++++++---------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index b20124456..49ddfc92c 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -4,9 +4,9 @@ from unittest.mock import patch from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument GROUND_TRUTH = [ - {"word_count": 60, "len_text": 318, "cut_type": "sentence_end"}, - {"word_count": 64, "len_text": 358, "cut_type": "sentence_end"}, - {"word_count": 56, "len_text": 255, "cut_type": "sentence_cut"}, + {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"}, + {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"}, + {"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"}, ] TEST_TEXT = """ diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index d34127eb3..e9caf3634 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -5,7 +5,7 @@ from cognee.modules.data.processing.document_types.ImageDocument import ImageDoc GROUND_TRUTH = [ {"word_count": 51, "len_text": 
298, "cut_type": "sentence_end"}, - {"word_count": 63, "len_text": 369, "cut_type": "sentence_end"}, + {"word_count": 62, "len_text": 369, "cut_type": "sentence_end"}, {"word_count": 44, "len_text": 294, "cut_type": "sentence_end"}, ] diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 1547baa46..9816f0529 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -7,12 +7,12 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum GROUND_TRUTH = { "code.txt": [ - {"word_count": 253, "len_text": 953, "cut_type": "paragraph_end"}, - {"word_count": 157, "len_text": 905, "cut_type": "paragraph_end"}, + {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"}, + {"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"}, ], "Natural_language_processing.txt": [ - {"word_count": 115, "len_text": 839, "cut_type": "paragraph_end"}, - {"word_count": 15, "len_text": 146, "cut_type": "paragraph_end"}, + {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"}, + {"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"}, ], } diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index f8fe00237..5355411d5 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -3,34 +3,34 @@ from cognee.tasks.chunks import chunk_by_paragraph GROUND_TRUTH = { "whole_text": [ { - "text": "This is example text. It contains multiple sentences.\n", - "word_count": 9, + "text": "This is example text. It contains multiple sentences.", + "word_count": 8, "cut_type": "paragraph_end", }, { - "text": "This is a second paragraph. First two paragraphs are whole.\n", - "word_count": 11, + "text": "\nThis is a second paragraph. First two paragraphs are whole.", + "word_count": 10, "cut_type": "paragraph_end", }, { - "text": "Third paragraph is a bit longer and is finished with a dot.", + "text": "\nThird paragraph is a bit longer and is finished with a dot.", "word_count": 12, "cut_type": "sentence_end", }, ], "cut_text": [ { - "text": "This is example text. It contains multiple sentences.\n", - "word_count": 9, + "text": "This is example text. It contains multiple sentences.", + "word_count": 8, "cut_type": "paragraph_end", }, { - "text": "This is a second paragraph. First two paragraphs are whole.\n", - "word_count": 11, + "text": "\nThis is a second paragraph. 
First two paragraphs are whole.", + "word_count": 10, "cut_type": "paragraph_end", }, { - "text": "Third paragraph is cut and is missing the dot at the end", + "text": "\nThird paragraph is cut and is missing the dot at the end", "word_count": 12, "cut_type": "word", }, From b4d509e68284cc1fef26218ab02ed0612b146a4f Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 14:23:09 +0100 Subject: [PATCH 30/39] Set batch_paragraph=False in run_chunking_test --- cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index 5355411d5..55eeb7fae 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -49,7 +49,7 @@ Third paragraph is cut and is missing the dot at the end""", def run_chunking_test(test_text, expected_chunks): chunks = [] - for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=True): + for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False): chunks.append(chunk_data) assert len(chunks) == 3 From 73f24f9e4db9a94522f3e133791f36a22d41b93a Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 14:40:42 +0100 Subject: [PATCH 31/39] Fix sentence_cut return value in inappropriate places --- cognee/tasks/chunks/chunk_by_paragraph.py | 1 - cognee/tasks/chunks/chunk_by_sentence.py | 16 ++++++++++++++-- .../integration/documents/AudioDocument_test.py | 7 +++---- .../integration/documents/TextDocument_test.py | 4 ++-- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index a85a2de26..cfe73471a 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -58,7 +58,6 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_index += 1 last_cut_type = end_type - last_paragraph_id = paragraph_id # Yield any remaining text if current_chunk: diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 9159922af..fedc0c9b2 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -10,12 +10,24 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): paragraph_id = uuid4() chunk_index = 0 word_count = 0 + section_end = False for (word, word_type) in chunk_by_word(data): sentence += word word_count += 1 - if word_type == "paragraph_end" or word_type == "sentence_end" or (maximum_length and (word_count == maximum_length)): + # this loop is to check if any letters come after a paragraph_end or sentence_end + # and if that is not the case, preserve the word_type for the final yield in the + # function + if word_type in ["paragraph_end", "sentence_end"]: + section_end = word_type + else: + for character in word: + if character.isalpha(): + section_end = "sentence_cut" + break + + if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)): yield (paragraph_id, chunk_index, sentence, word_count, word_type) sentence = "" word_count = 0 @@ -28,5 +40,5 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): chunk_index, sentence, word_count, - "sentence_cut", + section_end, ) diff --git 
a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index 49ddfc92c..f4df19849 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -6,7 +6,7 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc GROUND_TRUTH = [ {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"}, {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"}, - {"word_count": 41, "len_text": 220, "cut_type": "sentence_cut"}, + {"word_count": 41, "len_text": 219, "cut_type": "sentence_end"}, ] TEST_TEXT = """ @@ -21,8 +21,7 @@ TEST_TEXT = """ "Then you'll ship it without me. I won't stake my reputation on a house of cards." "Are you threatening to quit?" "No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this." -"The feature ships, Sarah. That's final." -""" +"The feature ships, Sarah. That's final.\"""" def test_AudioDocument(): @@ -31,7 +30,7 @@ def test_AudioDocument(): id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="" ) with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): - + l = list(document.read(chunk_size=64)) for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64) ): diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 9816f0529..ef7d42272 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -8,11 +8,11 @@ from cognee.modules.data.processing.document_types.TextDocument import TextDocum GROUND_TRUTH = { "code.txt": [ {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"}, - {"word_count": 104, "len_text": 833, "cut_type": "sentence_cut"}, + {"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"}, ], "Natural_language_processing.txt": [ {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"}, - {"word_count": 1, "len_text": 1, "cut_type": "sentence_cut"}, + {"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"}, ], } From 8b681529b1392f9e71c62fc7a1a380be0f6e5f26 Mon Sep 17 00:00:00 2001 From: 0xideas Date: Thu, 14 Nov 2024 14:42:15 +0100 Subject: [PATCH 32/39] Update cognee/tasks/chunks/chunk_by_paragraph.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- cognee/tasks/chunks/chunk_by_paragraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index cfe73471a..c6abd5b9d 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -65,7 +65,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs "text": current_chunk, "word_count": current_word_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), - "paragrapg_ids": paragraph_ids, + "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, "cut_type": last_cut_type } From d6a6a9eaba284c9100bf2c2bd3507833a8c216ee Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 15:03:09 +0100 Subject: [PATCH 33/39] Return sentence_cut instead of word in chunk_by_paragraph --- cognee/tasks/chunks/chunk_by_paragraph.py | 2 +- cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index c6abd5b9d..a05451bd3 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -67,7 +67,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, - "cut_type": last_cut_type + "cut_type": "sentence_cut" if last_cut_type == "word" else last_cut_type } diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index 55eeb7fae..3ddc6f4f5 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -32,7 +32,7 @@ GROUND_TRUTH = { { "text": "\nThird paragraph is cut and is missing the dot at the end", "word_count": 12, - "cut_type": "word", + "cut_type": "sentence_cut", }, ], } From 15420dd864f49911237f0f188d864ec724d4fe18 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 16:47:51 +0100 Subject: [PATCH 34/39] Fix paragraph_ids handling --- cognee/tasks/chunks/chunk_by_paragraph.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index a05451bd3..00bb5670c 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -13,10 +13,8 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs paragraph_ids = [] last_cut_type = None - for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): + for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - paragraph_ids.append(paragraph_id) - if current_word_count > 0 and current_word_count + word_count > paragraph_length: # Yield current chunk chunk_dict = { @@ -32,13 +30,13 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs # Start new chunk with current sentence paragraph_ids = [] - current_chunk = sentence - current_word_count = word_count + current_chunk = "" + current_word_count = 0 chunk_index += 1 - else: - # Just concatenate directly - no space handling - current_chunk += sentence - current_word_count += word_count + + paragraph_ids.append(paragraph_id) + current_chunk += sentence + current_word_count += word_count # Handle end of paragraph if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: From 84c98f16bb9501118dc856d7d7d83f3dc8ccefcd Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 16:49:13 +0100 Subject: [PATCH 35/39] Remove chunk_index attribute from chunk_by_sentence return value --- cognee/tasks/chunks/chunk_by_sentence.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index fedc0c9b2..bee074d04 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -8,7 +8,6 @@ from .chunk_by_word import chunk_by_word def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): sentence = "" paragraph_id = uuid4() - chunk_index = 0 word_count = 0 section_end = False @@ -28,16 +27,14 @@ def chunk_by_sentence(data: str, 
maximum_length: Optional[int] = None): break if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)): - yield (paragraph_id, chunk_index, sentence, word_count, word_type) + yield (paragraph_id, sentence, word_count, word_type) sentence = "" word_count = 0 paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id - chunk_index = 0 if word_type == "paragraph_end" else chunk_index + 1 if len(sentence) > 0: yield ( paragraph_id, - chunk_index, sentence, word_count, section_end, From 928e1075c6c6c5c6d3fba8ac58ed347a81506c00 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 16:55:24 +0100 Subject: [PATCH 36/39] Test chunk_by_paragraph chunk numbering --- .../graph/cognee_graph_elements_test.py | 24 ++++++++++++++++--- .../unit/modules/graph/cognee_graph_test.py | 10 +++++++- .../chunks/chunk_by_paragraph_2_test.py | 14 +++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py index 137b9f7e2..d2a1b6c59 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py @@ -1,7 +1,7 @@ -import pytest import numpy as np +import pytest -from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge +from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge, Node def test_node_initialization(): @@ -12,11 +12,13 @@ def test_node_initialization(): assert len(node.status) == 2 assert np.all(node.status == 1) + def test_node_invalid_dimension(): """Test that initializing a Node with a non-positive dimension raises an error.""" with pytest.raises(ValueError, match="Dimension must be a positive integer"): Node("node1", dimension=0) + def test_add_skeleton_neighbor(): """Test adding a neighbor to a node.""" node1 = Node("node1") @@ -24,6 +26,7 @@ def test_add_skeleton_neighbor(): node1.add_skeleton_neighbor(node2) assert node2 in node1.skeleton_neighbours + def test_remove_skeleton_neighbor(): """Test removing a neighbor from a node.""" node1 = Node("node1") @@ -32,6 +35,7 @@ def test_remove_skeleton_neighbor(): node1.remove_skeleton_neighbor(node2) assert node2 not in node1.skeleton_neighbours + def test_add_skeleton_edge(): """Test adding an edge updates both skeleton_edges and skeleton_neighbours.""" node1 = Node("node1") @@ -41,6 +45,7 @@ def test_add_skeleton_edge(): assert edge in node1.skeleton_edges assert node2 in node1.skeleton_neighbours + def test_remove_skeleton_edge(): """Test removing an edge updates both skeleton_edges and skeleton_neighbours.""" node1 = Node("node1") @@ -51,6 +56,7 @@ def test_remove_skeleton_edge(): assert edge not in node1.skeleton_edges assert node2 not in node1.skeleton_neighbours + def test_is_node_alive_in_dimension(): """Test checking node's alive status in a specific dimension.""" node = Node("node1", dimension=2) @@ -58,25 +64,30 @@ def test_is_node_alive_in_dimension(): node.status[1] = 0 assert not node.is_node_alive_in_dimension(1) + def test_node_alive_invalid_dimension(): """Test that checking alive status with an invalid dimension raises an error.""" node = Node("node1", dimension=1) with pytest.raises(ValueError, match="Dimension 1 is out of range"): node.is_node_alive_in_dimension(1) + def test_node_equality(): """Test equality between nodes.""" node1 = Node("node1") node2 = Node("node1") assert node1 == node2 + def test_node_hash(): 
"""Test hashing for Node.""" node = Node("node1") assert hash(node) == hash("node1") + ### Tests for Edge ### + def test_edge_initialization(): """Test that an Edge is initialized correctly.""" node1 = Node("node1") @@ -89,6 +100,7 @@ def test_edge_initialization(): assert len(edge.status) == 2 assert np.all(edge.status == 1) + def test_edge_invalid_dimension(): """Test that initializing an Edge with a non-positive dimension raises an error.""" node1 = Node("node1") @@ -96,6 +108,7 @@ def test_edge_invalid_dimension(): with pytest.raises(ValueError, match="Dimensions must be a positive integer."): Edge(node1, node2, dimension=0) + def test_is_edge_alive_in_dimension(): """Test checking edge's alive status in a specific dimension.""" node1 = Node("node1") @@ -105,6 +118,7 @@ def test_is_edge_alive_in_dimension(): edge.status[1] = 0 assert not edge.is_edge_alive_in_dimension(1) + def test_edge_alive_invalid_dimension(): """Test that checking alive status with an invalid dimension raises an error.""" node1 = Node("node1") @@ -113,6 +127,7 @@ def test_edge_alive_invalid_dimension(): with pytest.raises(ValueError, match="Dimension 1 is out of range"): edge.is_edge_alive_in_dimension(1) + def test_edge_equality_directed(): """Test equality between directed edges.""" node1 = Node("node1") @@ -121,6 +136,7 @@ def test_edge_equality_directed(): edge2 = Edge(node1, node2, directed=True) assert edge1 == edge2 + def test_edge_equality_undirected(): """Test equality between undirected edges.""" node1 = Node("node1") @@ -129,6 +145,7 @@ def test_edge_equality_undirected(): edge2 = Edge(node2, node1, directed=False) assert edge1 == edge2 + def test_edge_hash_directed(): """Test hashing for directed edges.""" node1 = Node("node1") @@ -136,9 +153,10 @@ def test_edge_hash_directed(): edge = Edge(node1, node2, directed=True) assert hash(edge) == hash((node1, node2)) + def test_edge_hash_undirected(): """Test hashing for undirected edges.""" node1 = Node("node1") node2 = Node("node2") edge = Edge(node1, node2, directed=False) - assert hash(edge) == hash(frozenset({node1, node2})) \ No newline at end of file + assert hash(edge) == hash(frozenset({node1, node2})) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py index 235ccf11d..d05292d75 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py @@ -1,7 +1,7 @@ import pytest -from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph +from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge, Node @pytest.fixture @@ -9,6 +9,7 @@ def setup_graph(): """Fixture to initialize a CogneeGraph instance.""" return CogneeGraph() + def test_add_node_success(setup_graph): """Test successful addition of a node.""" graph = setup_graph @@ -16,6 +17,7 @@ def test_add_node_success(setup_graph): graph.add_node(node) assert graph.get_node("node1") == node + def test_add_duplicate_node(setup_graph): """Test adding a duplicate node raises an exception.""" graph = setup_graph @@ -24,6 +26,7 @@ def test_add_duplicate_node(setup_graph): with pytest.raises(ValueError, match="Node with id node1 already exists."): graph.add_node(node) + def test_add_edge_success(setup_graph): """Test successful addition of an edge.""" graph = setup_graph @@ -37,6 +40,7 @@ def test_add_edge_success(setup_graph): assert edge in node1.skeleton_edges assert edge in 
node2.skeleton_edges + def test_add_duplicate_edge(setup_graph): """Test adding a duplicate edge raises an exception.""" graph = setup_graph @@ -49,6 +53,7 @@ def test_add_duplicate_edge(setup_graph): with pytest.raises(ValueError, match="Edge .* already exists in the graph."): graph.add_edge(edge) + def test_get_node_success(setup_graph): """Test retrieving an existing node.""" graph = setup_graph @@ -56,11 +61,13 @@ def test_get_node_success(setup_graph): graph.add_node(node) assert graph.get_node("node1") == node + def test_get_node_nonexistent(setup_graph): """Test retrieving a nonexistent node returns None.""" graph = setup_graph assert graph.get_node("nonexistent") is None + def test_get_edges_success(setup_graph): """Test retrieving edges of a node.""" graph = setup_graph @@ -72,6 +79,7 @@ def test_get_edges_success(setup_graph): graph.add_edge(edge) assert edge in graph.get_edges("node1") + def test_get_edges_nonexistent_node(setup_graph): """Test retrieving edges for a nonexistent node raises an exception.""" graph = setup_graph diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py index ad09c9671..8e900727d 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py @@ -37,3 +37,17 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): assert np.all( chunk_lengths <= paragraph_length ), f"{paragraph_length = }: {larger_chunks} are too large" + + +@pytest.mark.parametrize( + "input_text,paragraph_length,batch_paragraphs", + list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)), +) +def test_chunk_by_paragraph_chunk_numbering( + input_text, paragraph_length, batch_paragraphs +): + chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) + chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) + assert np.all( + chunk_indices == np.arange(len(chunk_indices)) + ), f"{chunk_indices = } are not monotonically increasing" From 14dd60576ed080507711d94ea872bb158a7e86ed Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Thu, 14 Nov 2024 17:06:16 +0100 Subject: [PATCH 37/39] Fix indexing in tests in chunk_by_sentence_test --- cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index 2f42f836a..d1c75d7ed 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -15,7 +15,7 @@ maximum_length_vals = [None, 8, 64] ) def test_chunk_by_sentence_isomorphism(input_text, maximum_length): chunks = chunk_by_sentence(input_text, maximum_length) - reconstructed_text = "".join([chunk[2] for chunk in chunks]) + reconstructed_text = "".join([chunk[1] for chunk in chunks]) assert ( reconstructed_text == input_text ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" @@ -33,7 +33,7 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length): def test_paragraph_chunk_length(input_text, maximum_length): chunks = list(chunk_by_sentence(input_text, maximum_length)) - chunk_lengths = np.array([len(list(chunk_by_word(chunk[2]))) for chunk in chunks]) + chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk 
in chunks])
     larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
     assert np.all(

From e40e7386a0f61dec7c6aa6dac1fc4a4ab5f3c9cf Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 14 Nov 2024 17:16:04 +0100
Subject: [PATCH 38/39] Refactor word_type yielding in chunk_by_sentence

---
 cognee/tasks/chunks/chunk_by_sentence.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py
index bee074d04..c6848f066 100644
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@@ -10,29 +10,32 @@ def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
     paragraph_id = uuid4()
     word_count = 0
     section_end = False
+    word_type_state = None
+    # the yielded word_type_state is identical to word_type, except when
+    # the word type is 'word', the word doesn't contain any letters
+    # and words with the same characteristics connect it to a preceding
+    # word with word_type 'paragraph_end' or 'sentence_end'
     for (word, word_type) in chunk_by_word(data):
         sentence += word
         word_count += 1

         if word_type in ["paragraph_end", "sentence_end"]:
-            section_end = word_type
+            word_type_state = word_type
         else:
             for character in word:
                 if character.isalpha():
-                    section_end = "sentence_cut"
+                    word_type_state = word_type
                     break

         if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
-            yield (paragraph_id, sentence, word_count, word_type)
+            yield (paragraph_id, sentence, word_count, word_type_state)
             sentence = ""
             word_count = 0
             paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id

     if len(sentence) > 0:
+        section_end = "sentence_cut" if word_type_state == "word" else word_type_state
         yield (
             paragraph_id,
             sentence,
             word_count,
             section_end,
         )

From f51a44fd76bc1603b30c1c6a50130ef7ff6f80ba Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 14 Nov 2024 17:18:36 +0100
Subject: [PATCH 39/39] Remove unneeded document.read in AudioDocument_test

---
 cognee/tests/integration/documents/AudioDocument_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py
index f4df19849..f133ef811 100644
--- a/cognee/tests/integration/documents/AudioDocument_test.py
+++ b/cognee/tests/integration/documents/AudioDocument_test.py
@@ -30,7 +30,6 @@ def test_AudioDocument():
         id=uuid.uuid4(), name="audio-dummy-test", raw_data_location=""
     )
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
-        l = list(document.read(chunk_size=64))
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH, document.read(chunk_size=64)
         ):
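
To sanity-check the net effect of the series, here is a minimal sketch of how chunk_by_paragraph behaves once all 39 patches are applied. It relies only on behavior shown in the diffs above: the paragraph_ids, chunk_id, chunk_index, and cut_type keys from PATCH 27 (key typo fixed in PATCH 32) and the monotonically increasing chunk_index asserted in PATCH 36. The sample text is made up, and the snippet assumes the series applies cleanly to the base tree.

from cognee.tasks.chunks import chunk_by_paragraph

# Illustrative input only; paragraph_length is a per-chunk word budget.
text = "This is example text. It contains multiple sentences.\nThis is a second paragraph."

for chunk in chunk_by_paragraph(text, paragraph_length=12, batch_paragraphs=False):
    # After PATCH 26/36, chunk_index runs 0, 1, 2, ... across the whole input,
    # and each chunk lists the paragraph ids it was assembled from (PATCH 27/34).
    print(chunk["chunk_index"], chunk["word_count"], chunk["cut_type"], chunk["paragraph_ids"])

With batch_paragraphs=False every paragraph boundary closes a chunk, which is also the mode run_chunking_test switches to in PATCH 30.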