From 98cbaaff685b3e2e8db45c016bca2fabc1cb4589 Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Wed, 13 Nov 2024 11:30:42 +0100
Subject: [PATCH] Add isomorphism tests

---
 .../chunks/chunk_by_paragraph_test2.py        |  19 ++
 .../chunks/chunk_by_sentence_test.py          |  19 ++
 .../processing/chunks/chunk_by_word_test.py   |  33 +++
 .../unit/processing/chunks/test_input.py      | 275 ++++++++++++++++++
 4 files changed, 346 insertions(+)
 create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py
 create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
 create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_word_test.py
 create mode 100644 cognee/tests/unit/processing/chunks/test_input.py

diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py
new file mode 100644
index 000000000..2cb95f416
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test2.py
@@ -0,0 +1,19 @@
+import pytest
+import numpy as np
+from cognee.tasks.chunks import chunk_by_paragraph
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+
+
+@pytest.mark.parametrize("input_text", [
+    INPUT_TEXTS["english_text"],
+    INPUT_TEXTS["english_lists"],
+    INPUT_TEXTS["python_code"],
+    INPUT_TEXTS["chinese_text"],
+])
+def test_chunk_by_paragraph_isomorphism(input_text):
+    """Joining the "text" field of every chunk must reproduce the input exactly."""
+    chunks = chunk_by_paragraph(input_text)
+    reconstructed_text = "".join(chunk["text"] for chunk in chunks)
+    assert reconstructed_text == input_text, (
+        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
+    )
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
new file mode 100644
index 000000000..a21a3e9f9
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
@@ -0,0 +1,19 @@
+import pytest
+import numpy as np
+from cognee.tasks.chunks import chunk_by_sentence
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+
+
+@pytest.mark.parametrize("input_text", [
+    INPUT_TEXTS["english_text"],
+    INPUT_TEXTS["english_lists"],
+    INPUT_TEXTS["python_code"],
+    INPUT_TEXTS["chinese_text"],
+])
+def test_chunk_by_sentence_isomorphism(input_text):
+    """Joining element 2 of every chunk must reproduce the input exactly."""
+    chunks = chunk_by_sentence(input_text)
+    reconstructed_text = "".join(chunk[2] for chunk in chunks)
+    assert reconstructed_text == input_text, (
+        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
+    )
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py
new file mode 100644
index 000000000..54e19b162
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py
@@ -0,0 +1,33 @@
+import pytest
+import numpy as np
+from cognee.tasks.chunks import chunk_by_word
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+
+# One list shared by all tests in this module so they cover the same samples.
+INPUT_TEXT_CASES = [
+    INPUT_TEXTS["english_text"],
+    INPUT_TEXTS["english_lists"],
+    INPUT_TEXTS["python_code"],
+    INPUT_TEXTS["chinese_text"],
+]
+
+
+@pytest.mark.parametrize("input_text", INPUT_TEXT_CASES)
+def test_chunk_by_word_isomorphism(input_text):
+    """Joining element 0 of every chunk must reproduce the input exactly."""
+    chunks = chunk_by_word(input_text)
+    reconstructed_text = "".join(chunk[0] for chunk in chunks)
+    assert reconstructed_text == input_text, (
+        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
+    )
+
+
+# Parametrized explicitly: an un-parametrized test argument is resolved as a
+# fixture, and this module defines no "input_text" fixture.
+@pytest.mark.parametrize("input_text", INPUT_TEXT_CASES)
+def test_chunk_by_word_splits(input_text):
+    """No chunk may contain a space once its surrounding whitespace is stripped."""
+    chunks = np.array(list(chunk_by_word(input_text)))
+    space_test = np.array([" " not in chunk[0].strip() for chunk in chunks])
+
+    assert np.all(space_test), f"These chunks contain spaces within them: {chunks[~space_test]}"
diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py
new file mode 100644
index 000000000..ad6603d9d
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/test_input.py
@@ -0,0 +1,275 @@
+import pytest
+
+INPUT_TEXTS = {
+
"english_lists": """Let me think through the key attributes that would be important to test in a text chunking system. +Here are the essential attributes to test: + +Chunking Boundaries Accuracy: + + +Proper sentence boundary detection +Handling of punctuation marks +Recognition of paragraph breaks +Treatment of special characters and whitespace +Proper handling of quotes and nested text structures + + +Language Support: + + +Handling of different languages and scripts +Support for multilingual documents +Proper Unicode handling +Treatment of language-specific punctuation + + +Special Cases Handling: + + +Lists and bullet points +Tables and structured content +Code blocks or technical content +Citations and references +Headers and footers +URLs and email addresses + + +Performance Metrics: + + +Processing speed for different text lengths +Memory usage with large documents +Scalability with increasing document size +Consistency across multiple runs + + +Document Format Support: + + +Plain text handling +HTML/XML content +PDF text extraction +Markdown formatting +Mixed format documents + + +Error Handling: + + +Malformed input text +Incomplete sentences +Truncated documents +Invalid characters +Missing punctuation + + +Configuration Flexibility: + + +Adjustable chunk sizes +Customizable boundary rules +Configurable overlap between chunks +Token vs. 
character-based chunking options + + +Preservation of Context: + + +Maintaining semantic coherence +Preserving contextual relationships +Handling cross-references +Maintaining document structure + +Would you like me to elaborate on any of these attributes or discuss specific testing strategies for them?""", + "python_code": """from typing import ( + Literal as L, + Any, + TypeAlias, + overload, + TypeVar, + Protocol, + type_check_only, +) + +from numpy import generic + +from numpy._typing import ( + ArrayLike, + NDArray, + _ArrayLikeInt, + _ArrayLike, +) + +__all__ = ["pad"] + +_SCT = TypeVar("_SCT", bound=generic) + +@type_check_only +class _ModeFunc(Protocol): + def __call__( + self, + vector: NDArray[Any], + iaxis_pad_width: tuple[int, int], + iaxis: int, + kwargs: dict[str, Any], + /, + ) -> None: ... + +_ModeKind: TypeAlias = L[ + "constant", + "edge", + "linear_ramp", + "maximum", + "mean", + "median", + "minimum", + "reflect", + "symmetric", + "wrap", + "empty", +] + + +# TODO: In practice each keyword argument is exclusive to one or more +# specific modes. Consider adding more overloads to express this in the future. + +# Expand `**kwargs` into explicit keyword-only arguments +@overload +def pad( + array: _ArrayLike[_SCT], + pad_width: _ArrayLikeInt, + mode: _ModeKind = ..., + *, + stat_length: None | _ArrayLikeInt = ..., + constant_values: ArrayLike = ..., + end_values: ArrayLike = ..., + reflect_type: L["odd", "even"] = ..., +) -> NDArray[_SCT]: ... +@overload +def pad( + array: ArrayLike, + pad_width: _ArrayLikeInt, + mode: _ModeKind = ..., + *, + stat_length: None | _ArrayLikeInt = ..., + constant_values: ArrayLike = ..., + end_values: ArrayLike = ..., + reflect_type: L["odd", "even"] = ..., +) -> NDArray[Any]: ... +@overload +def pad( + array: _ArrayLike[_SCT], + pad_width: _ArrayLikeInt, + mode: _ModeFunc, + **kwargs: Any, +) -> NDArray[_SCT]: ... 
+@overload +def pad( + array: ArrayLike, + pad_width: _ArrayLikeInt, + mode: _ModeFunc, + **kwargs: Any, +) -> NDArray[Any]: ...""", + "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""", + "english_text": """O for that warning voice, which he who saw +Th' Apocalyps, heard cry in Heaven aloud, +Then when the Dragon, put to second rout, +Came furious down to be reveng'd on men, +Wo to the inhabitants on Earth! that now, [ 5 ] +While time was, our first-Parents had bin warnd +The coming of thir secret foe, and scap'd +Haply so scap'd his mortal snare; for now +Satan, now first inflam'd with rage, came down, +The Tempter ere th' Accuser of man-kind, [ 10 ] +To wreck on innocent frail man his loss +Of that first Battel, and his flight to Hell: +Yet not rejoycing in his speed, though bold, +Far off and fearless, nor with cause to boast, +Begins his dire attempt, which nigh the birth [ 15 ] +Now rowling, boiles in his tumultuous brest, +And like a devillish Engine back recoiles +Upon himself; horror and doubt distract +His troubl'd thoughts, and from the bottom stirr +The Hell within him, for within him Hell [ 20 ] +He brings, and round about him, nor from Hell +One step no more then from himself can fly +By change of place: Now conscience wakes despair +That slumberd, wakes the bitter memorie +Of what he was, what is, and what must be [ 25 ] +Worse; of worse deeds worse sufferings must ensue. +Sometimes towards Eden which now in his view +Lay pleasant, his grievd look he fixes sad, +Sometimes towards Heav'n and the full-blazing Sun, +Which now sat high in his Meridian Towre: [ 30 ] +Then much revolving, thus in sighs began. 
+ +O thou that with surpassing Glory crownd, +Look'st from thy sole Dominion like the God +Of this new World; at whose sight all the Starrs +Hide thir diminisht heads; to thee I call, [ 35 ] +But with no friendly voice, and add thy name +O Sun, to tell thee how I hate thy beams +That bring to my remembrance from what state +I fell, how glorious once above thy Spheare; +Till Pride and worse Ambition threw me down [ 40 ] +Warring in Heav'n against Heav'ns matchless King: +Ah wherefore! he deservd no such return +From me, whom he created what I was +In that bright eminence, and with his good +Upbraided none; nor was his service hard. [ 45 ] +What could be less then to afford him praise, +The easiest recompence, and pay him thanks, +How due! yet all his good prov'd ill in me, +And wrought but malice; lifted up so high +I sdeind subjection, and thought one step higher [ 50 ] +Would set me highest, and in a moment quit +The debt immense of endless gratitude, +So burthensome, still paying, still to ow; +Forgetful what from him I still receivd, +And understood not that a grateful mind [ 55 ] +By owing owes not, but still pays, at once +Indebted and dischargd; what burden then? +O had his powerful Destiny ordaind +Me some inferiour Angel, I had stood +Then happie; no unbounded hope had rais'd [ 60 ] +Ambition. Yet why not? som other Power +As great might have aspir'd, and me though mean +Drawn to his part; but other Powers as great +Fell not, but stand unshak'n, from within +Or from without, to all temptations arm'd. [ 65 ] +Hadst thou the same free Will and Power to stand? +Thou hadst: whom hast thou then or what to accuse, +But Heav'ns free Love dealt equally to all? +Be then his Love accurst, since love or hate, +To me alike, it deals eternal woe. [ 70 ] +Nay curs'd be thou; since against his thy will +Chose freely what it now so justly rues. +Me miserable! which way shall I flie +Infinite wrauth, and infinite despaire? 
+Which way I flie is Hell; my self am Hell; [ 75 ] +And in the lowest deep a lower deep +Still threatning to devour me opens wide, +To which the Hell I suffer seems a Heav'n. +O then at last relent: is there no place +Left for Repentance, none for Pardon left? [ 80 ] +None left but by submission; and that word +Disdain forbids me, and my dread of shame +Among the Spirits beneath, whom I seduc'd +With other promises and other vaunts +Then to submit, boasting I could subdue [ 85 ] +Th' Omnipotent. Ay me, they little know +How dearly I abide that boast so vaine, +Under what torments inwardly I groane: +While they adore me on the Throne of Hell, +With Diadem and Sceptre high advanc'd [ 90 ] +The lower still I fall, onely Supream +In miserie; such joy Ambition findes. +But say I could repent and could obtaine +By Act of Grace my former state; how soon +Would higth recall high thoughts, how soon unsay [ 95 ] +What feign'd submission swore: ease would recant +Vows made in pain, as violent and void. +For never can true reconcilement grow +Where wounds of deadly hate have peirc'd so deep: +Which would but lead me to a worse relapse [ 100 ]""" +} \ No newline at end of file