Add isomorphism tests
This commit is contained in:
parent
6f0637a028
commit
98cbaaff68
4 changed files with 335 additions and 0 deletions
|
|
@ -0,0 +1,17 @@
|
||||||
|
import pytest
|
||||||
|
import numpy as np
|
||||||
|
from cognee.tasks.chunks import chunk_by_paragraph
|
||||||
|
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS[name]
        for name in ("english_text", "english_lists", "python_code", "chinese_text")
    ],
)
def test_chunk_by_paragraph_isomorphism(input_text):
    """Joining the "text" entry of every paragraph chunk must give back the input unchanged."""
    pieces = (paragraph["text"] for paragraph in chunk_by_paragraph(input_text))
    reconstructed_text = "".join(pieces)

    # The local keeps this name because the `=` specifier in the f-string echoes the expression text.
    assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
|
||||||
|
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
import pytest
|
||||||
|
import numpy as np
|
||||||
|
from cognee.tasks.chunks import chunk_by_sentence
|
||||||
|
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS[name]
        for name in ("english_text", "english_lists", "python_code", "chinese_text")
    ],
)
def test_chunk_by_sentence_isomorphism(input_text):
    """Joining element 2 of every sentence chunk must give back the input unchanged."""
    # Each item produced by chunk_by_sentence is indexable; index 2 is the piece that is concatenated.
    pieces = (sentence[2] for sentence in chunk_by_sentence(input_text))
    reconstructed_text = "".join(pieces)

    # The local keeps this name because the `=` specifier in the f-string echoes the expression text.
    assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
|
||||||
|
|
||||||
26
cognee/tests/unit/processing/chunks/chunk_by_word_test.py
Normal file
26
cognee/tests/unit/processing/chunks/chunk_by_word_test.py
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
import pytest
|
||||||
|
import numpy as np
|
||||||
|
from cognee.tasks.chunks import chunk_by_word
|
||||||
|
from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "input_text",
    [
        INPUT_TEXTS[name]
        for name in ("english_text", "english_lists", "python_code", "chinese_text")
    ],
)
def test_chunk_by_word_isomorphism(input_text):
    """Joining element 0 of every word chunk must give back the input unchanged."""
    # Each item produced by chunk_by_word is indexable; index 0 is the piece that is concatenated.
    pieces = (word[0] for word in chunk_by_word(input_text))
    reconstructed_text = "".join(pieces)

    # The local keeps this name because the `=` specifier in the f-string echoes the expression text.
    assert reconstructed_text == input_text, f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
|
||||||
|
|
||||||
|
# Without parametrization pytest treats `input_text` as a fixture request and
# errors at setup, so the assertion below would never run.
@pytest.mark.parametrize("input_text", [
    INPUT_TEXTS["english_text"],
    INPUT_TEXTS["english_lists"],
    INPUT_TEXTS["python_code"],
    INPUT_TEXTS["chinese_text"],
])
def test_chunk_by_word_splits(input_text):
    """Check that no chunk yielded by chunk_by_word has a space inside its text.

    Element 0 of each chunk is inspected after stripping, so surrounding
    whitespace is tolerated and only interior spaces count as a failure.
    """
    # Collect the offending chunks directly so the failure message can list them.
    chunks_with_spaces = [
        chunk for chunk in chunk_by_word(input_text) if " " in chunk[0].strip()
    ]

    assert not chunks_with_spaces, f"These chunks contain spaces within them: {chunks_with_spaces}"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
275
cognee/tests/unit/processing/chunks/test_input.py
Normal file
275
cognee/tests/unit/processing/chunks/test_input.py
Normal file
|
|
@ -0,0 +1,275 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
INPUT_TEXTS = {
|
||||||
|
"english_lists": """Let me think through the key attributes that would be important to test in a text chunking system.
|
||||||
|
Here are the essential attributes to test:
|
||||||
|
|
||||||
|
Chunking Boundaries Accuracy:
|
||||||
|
|
||||||
|
|
||||||
|
Proper sentence boundary detection
|
||||||
|
Handling of punctuation marks
|
||||||
|
Recognition of paragraph breaks
|
||||||
|
Treatment of special characters and whitespace
|
||||||
|
Proper handling of quotes and nested text structures
|
||||||
|
|
||||||
|
|
||||||
|
Language Support:
|
||||||
|
|
||||||
|
|
||||||
|
Handling of different languages and scripts
|
||||||
|
Support for multilingual documents
|
||||||
|
Proper Unicode handling
|
||||||
|
Treatment of language-specific punctuation
|
||||||
|
|
||||||
|
|
||||||
|
Special Cases Handling:
|
||||||
|
|
||||||
|
|
||||||
|
Lists and bullet points
|
||||||
|
Tables and structured content
|
||||||
|
Code blocks or technical content
|
||||||
|
Citations and references
|
||||||
|
Headers and footers
|
||||||
|
URLs and email addresses
|
||||||
|
|
||||||
|
|
||||||
|
Performance Metrics:
|
||||||
|
|
||||||
|
|
||||||
|
Processing speed for different text lengths
|
||||||
|
Memory usage with large documents
|
||||||
|
Scalability with increasing document size
|
||||||
|
Consistency across multiple runs
|
||||||
|
|
||||||
|
|
||||||
|
Document Format Support:
|
||||||
|
|
||||||
|
|
||||||
|
Plain text handling
|
||||||
|
HTML/XML content
|
||||||
|
PDF text extraction
|
||||||
|
Markdown formatting
|
||||||
|
Mixed format documents
|
||||||
|
|
||||||
|
|
||||||
|
Error Handling:
|
||||||
|
|
||||||
|
|
||||||
|
Malformed input text
|
||||||
|
Incomplete sentences
|
||||||
|
Truncated documents
|
||||||
|
Invalid characters
|
||||||
|
Missing punctuation
|
||||||
|
|
||||||
|
|
||||||
|
Configuration Flexibility:
|
||||||
|
|
||||||
|
|
||||||
|
Adjustable chunk sizes
|
||||||
|
Customizable boundary rules
|
||||||
|
Configurable overlap between chunks
|
||||||
|
Token vs. character-based chunking options
|
||||||
|
|
||||||
|
|
||||||
|
Preservation of Context:
|
||||||
|
|
||||||
|
|
||||||
|
Maintaining semantic coherence
|
||||||
|
Preserving contextual relationships
|
||||||
|
Handling cross-references
|
||||||
|
Maintaining document structure
|
||||||
|
|
||||||
|
Would you like me to elaborate on any of these attributes or discuss specific testing strategies for them?""",
|
||||||
|
"python_code": """from typing import (
|
||||||
|
Literal as L,
|
||||||
|
Any,
|
||||||
|
TypeAlias,
|
||||||
|
overload,
|
||||||
|
TypeVar,
|
||||||
|
Protocol,
|
||||||
|
type_check_only,
|
||||||
|
)
|
||||||
|
|
||||||
|
from numpy import generic
|
||||||
|
|
||||||
|
from numpy._typing import (
|
||||||
|
ArrayLike,
|
||||||
|
NDArray,
|
||||||
|
_ArrayLikeInt,
|
||||||
|
_ArrayLike,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = ["pad"]
|
||||||
|
|
||||||
|
_SCT = TypeVar("_SCT", bound=generic)
|
||||||
|
|
||||||
|
@type_check_only
|
||||||
|
class _ModeFunc(Protocol):
|
||||||
|
def __call__(
|
||||||
|
self,
|
||||||
|
vector: NDArray[Any],
|
||||||
|
iaxis_pad_width: tuple[int, int],
|
||||||
|
iaxis: int,
|
||||||
|
kwargs: dict[str, Any],
|
||||||
|
/,
|
||||||
|
) -> None: ...
|
||||||
|
|
||||||
|
_ModeKind: TypeAlias = L[
|
||||||
|
"constant",
|
||||||
|
"edge",
|
||||||
|
"linear_ramp",
|
||||||
|
"maximum",
|
||||||
|
"mean",
|
||||||
|
"median",
|
||||||
|
"minimum",
|
||||||
|
"reflect",
|
||||||
|
"symmetric",
|
||||||
|
"wrap",
|
||||||
|
"empty",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: In practice each keyword argument is exclusive to one or more
|
||||||
|
# specific modes. Consider adding more overloads to express this in the future.
|
||||||
|
|
||||||
|
# Expand `**kwargs` into explicit keyword-only arguments
|
||||||
|
@overload
|
||||||
|
def pad(
|
||||||
|
array: _ArrayLike[_SCT],
|
||||||
|
pad_width: _ArrayLikeInt,
|
||||||
|
mode: _ModeKind = ...,
|
||||||
|
*,
|
||||||
|
stat_length: None | _ArrayLikeInt = ...,
|
||||||
|
constant_values: ArrayLike = ...,
|
||||||
|
end_values: ArrayLike = ...,
|
||||||
|
reflect_type: L["odd", "even"] = ...,
|
||||||
|
) -> NDArray[_SCT]: ...
|
||||||
|
@overload
|
||||||
|
def pad(
|
||||||
|
array: ArrayLike,
|
||||||
|
pad_width: _ArrayLikeInt,
|
||||||
|
mode: _ModeKind = ...,
|
||||||
|
*,
|
||||||
|
stat_length: None | _ArrayLikeInt = ...,
|
||||||
|
constant_values: ArrayLike = ...,
|
||||||
|
end_values: ArrayLike = ...,
|
||||||
|
reflect_type: L["odd", "even"] = ...,
|
||||||
|
) -> NDArray[Any]: ...
|
||||||
|
@overload
|
||||||
|
def pad(
|
||||||
|
array: _ArrayLike[_SCT],
|
||||||
|
pad_width: _ArrayLikeInt,
|
||||||
|
mode: _ModeFunc,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> NDArray[_SCT]: ...
|
||||||
|
@overload
|
||||||
|
def pad(
|
||||||
|
array: ArrayLike,
|
||||||
|
pad_width: _ArrayLikeInt,
|
||||||
|
mode: _ModeFunc,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> NDArray[Any]: ...""",
|
||||||
|
"chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
|
||||||
|
"english_text": """O for that warning voice, which he who saw
|
||||||
|
Th' Apocalyps, heard cry in Heaven aloud,
|
||||||
|
Then when the Dragon, put to second rout,
|
||||||
|
Came furious down to be reveng'd on men,
|
||||||
|
Wo to the inhabitants on Earth! that now, [ 5 ]
|
||||||
|
While time was, our first-Parents had bin warnd
|
||||||
|
The coming of thir secret foe, and scap'd
|
||||||
|
Haply so scap'd his mortal snare; for now
|
||||||
|
Satan, now first inflam'd with rage, came down,
|
||||||
|
The Tempter ere th' Accuser of man-kind, [ 10 ]
|
||||||
|
To wreck on innocent frail man his loss
|
||||||
|
Of that first Battel, and his flight to Hell:
|
||||||
|
Yet not rejoycing in his speed, though bold,
|
||||||
|
Far off and fearless, nor with cause to boast,
|
||||||
|
Begins his dire attempt, which nigh the birth [ 15 ]
|
||||||
|
Now rowling, boiles in his tumultuous brest,
|
||||||
|
And like a devillish Engine back recoiles
|
||||||
|
Upon himself; horror and doubt distract
|
||||||
|
His troubl'd thoughts, and from the bottom stirr
|
||||||
|
The Hell within him, for within him Hell [ 20 ]
|
||||||
|
He brings, and round about him, nor from Hell
|
||||||
|
One step no more then from himself can fly
|
||||||
|
By change of place: Now conscience wakes despair
|
||||||
|
That slumberd, wakes the bitter memorie
|
||||||
|
Of what he was, what is, and what must be [ 25 ]
|
||||||
|
Worse; of worse deeds worse sufferings must ensue.
|
||||||
|
Sometimes towards Eden which now in his view
|
||||||
|
Lay pleasant, his grievd look he fixes sad,
|
||||||
|
Sometimes towards Heav'n and the full-blazing Sun,
|
||||||
|
Which now sat high in his Meridian Towre: [ 30 ]
|
||||||
|
Then much revolving, thus in sighs began.
|
||||||
|
|
||||||
|
O thou that with surpassing Glory crownd,
|
||||||
|
Look'st from thy sole Dominion like the God
|
||||||
|
Of this new World; at whose sight all the Starrs
|
||||||
|
Hide thir diminisht heads; to thee I call, [ 35 ]
|
||||||
|
But with no friendly voice, and add thy name
|
||||||
|
O Sun, to tell thee how I hate thy beams
|
||||||
|
That bring to my remembrance from what state
|
||||||
|
I fell, how glorious once above thy Spheare;
|
||||||
|
Till Pride and worse Ambition threw me down [ 40 ]
|
||||||
|
Warring in Heav'n against Heav'ns matchless King:
|
||||||
|
Ah wherefore! he deservd no such return
|
||||||
|
From me, whom he created what I was
|
||||||
|
In that bright eminence, and with his good
|
||||||
|
Upbraided none; nor was his service hard. [ 45 ]
|
||||||
|
What could be less then to afford him praise,
|
||||||
|
The easiest recompence, and pay him thanks,
|
||||||
|
How due! yet all his good prov'd ill in me,
|
||||||
|
And wrought but malice; lifted up so high
|
||||||
|
I sdeind subjection, and thought one step higher [ 50 ]
|
||||||
|
Would set me highest, and in a moment quit
|
||||||
|
The debt immense of endless gratitude,
|
||||||
|
So burthensome, still paying, still to ow;
|
||||||
|
Forgetful what from him I still receivd,
|
||||||
|
And understood not that a grateful mind [ 55 ]
|
||||||
|
By owing owes not, but still pays, at once
|
||||||
|
Indebted and dischargd; what burden then?
|
||||||
|
O had his powerful Destiny ordaind
|
||||||
|
Me some inferiour Angel, I had stood
|
||||||
|
Then happie; no unbounded hope had rais'd [ 60 ]
|
||||||
|
Ambition. Yet why not? som other Power
|
||||||
|
As great might have aspir'd, and me though mean
|
||||||
|
Drawn to his part; but other Powers as great
|
||||||
|
Fell not, but stand unshak'n, from within
|
||||||
|
Or from without, to all temptations arm'd. [ 65 ]
|
||||||
|
Hadst thou the same free Will and Power to stand?
|
||||||
|
Thou hadst: whom hast thou then or what to accuse,
|
||||||
|
But Heav'ns free Love dealt equally to all?
|
||||||
|
Be then his Love accurst, since love or hate,
|
||||||
|
To me alike, it deals eternal woe. [ 70 ]
|
||||||
|
Nay curs'd be thou; since against his thy will
|
||||||
|
Chose freely what it now so justly rues.
|
||||||
|
Me miserable! which way shall I flie
|
||||||
|
Infinite wrauth, and infinite despaire?
|
||||||
|
Which way I flie is Hell; my self am Hell; [ 75 ]
|
||||||
|
And in the lowest deep a lower deep
|
||||||
|
Still threatning to devour me opens wide,
|
||||||
|
To which the Hell I suffer seems a Heav'n.
|
||||||
|
O then at last relent: is there no place
|
||||||
|
Left for Repentance, none for Pardon left? [ 80 ]
|
||||||
|
None left but by submission; and that word
|
||||||
|
Disdain forbids me, and my dread of shame
|
||||||
|
Among the Spirits beneath, whom I seduc'd
|
||||||
|
With other promises and other vaunts
|
||||||
|
Then to submit, boasting I could subdue [ 85 ]
|
||||||
|
Th' Omnipotent. Ay me, they little know
|
||||||
|
How dearly I abide that boast so vaine,
|
||||||
|
Under what torments inwardly I groane:
|
||||||
|
While they adore me on the Throne of Hell,
|
||||||
|
With Diadem and Sceptre high advanc'd [ 90 ]
|
||||||
|
The lower still I fall, onely Supream
|
||||||
|
In miserie; such joy Ambition findes.
|
||||||
|
But say I could repent and could obtaine
|
||||||
|
By Act of Grace my former state; how soon
|
||||||
|
Would higth recall high thoughts, how soon unsay [ 95 ]
|
||||||
|
What feign'd submission swore: ease would recant
|
||||||
|
Vows made in pain, as violent and void.
|
||||||
|
For never can true reconcilement grow
|
||||||
|
Where wounds of deadly hate have peirc'd so deep:
|
||||||
|
Which would but lead me to a worse relapse [ 100 ]"""
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue