feat: Eliminate the use of max_chunk_tokens and use a unified max_chunk_size instead [cog-1381] (#626)


## Description
This PR replaces the two separate chunking limits, `max_chunk_tokens` (a token budget) and `chunk_size` (a word budget), with a single `max_chunk_size` parameter threaded through chunkers, document readers, and the chunk-extraction task. Sizes are measured with the embedding engine's tokenizer when one is configured and fall back to plain word counts otherwise; a single word that exceeds the limit now raises a `ValueError` instead of silently producing an oversized chunk.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Refactor**
  - Simplified text processing by unifying multiple size-related parameters into a single metric across chunking and extraction.
  - Streamlined text segmentation by removing redundant calculations and checks, resulting in more consistent chunk management.
- **Chores**
  - Removed the `modal` package as a dependency.
- **Documentation**
  - Updated the README.md with a new demo video link and clarified default environment variable settings.
  - Enhanced the CONTRIBUTING.md to improve clarity and engagement for potential contributors.
- **Bug Fixes**
  - Extended sentence-ending punctuation handling to cover additional (full-width CJK) characters.
- **Version Update**
  - Updated the project version to 0.1.33 in pyproject.toml.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
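
At a glance, the call-site change looks like this; a minimal sketch based on the diff below (the `document` object is assumed to be a cognee `Document` set up as in the tests):

```python
from cognee.modules.chunking.TextChunker import TextChunker

# Before: two limits, words and tokens, threaded through separately.
# chunks = document.read(chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=2048)

# After: one unified limit, measured in tokens when the embedding engine has
# a tokenizer and in words otherwise.
chunks = document.read(chunker_cls=TextChunker, max_chunk_size=1024)
```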
alekszievr committed 2025-03-12 14:03:41 +01:00 (committed by GitHub)
parent b78d9f196f, commit c1f7b667d1
27 changed files with 285 additions and 219 deletions

View file

@@ -113,7 +113,7 @@ def generate_dataset_name(dataset_name: str) -> str:
 async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
-    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunk_size=1024, chunker=TextChunker
+    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker
 ) -> list[Task]:
     if user is None:
         user = await get_default_user()
@@ -125,9 +125,8 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
         Task(check_permissions_on_documents, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents,
-            max_chunk_tokens=get_max_chunk_tokens(),
+            max_chunk_size=get_max_chunk_tokens(),
             chunker=chunker,
-            chunk_size=chunk_size,
         ),  # Extract text chunks based on the document type.
         Task(
             extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}

View file

@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
         await cognee.add(self.raw_corpus)
-        tasks = await self.task_getter(chunk_size=chunk_size, chunker=TextChunker)
+        tasks = await self.task_getter(chunker=TextChunker)
         await cognee.cognify(tasks=tasks)

View file

@@ -48,7 +48,6 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
     )
     questions = await corpus_builder.build_corpus(
         limit=params.get("number_of_samples_in_corpus"),
-        chunk_size=chunk_size,
         chunker=chunker,
         load_golden_context=params.get("evaluating_contexts"),
     )

View file

@@ -1,13 +1,12 @@
 class Chunker:
-    def __init__(self, document, get_text: callable, max_chunk_tokens: int, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, max_chunk_size: int):
         self.chunk_index = 0
         self.chunk_size = 0
         self.token_count = 0
         self.document = document
-        self.max_chunk_size = chunk_size
+        self.max_chunk_size = max_chunk_size
         self.get_text = get_text
-        self.max_chunk_tokens = max_chunk_tokens

     def read(self):
         raise NotImplementedError
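
With the constructor reduced to a single limit, wiring up a concrete chunker is one call. A minimal sketch, assuming a `document` instance created elsewhere and the `TextChunker` subclass from this PR:

```python
from cognee.modules.chunking.TextChunker import TextChunker

# 'document' is assumed to be a Document instance created elsewhere.
chunker = TextChunker(document, get_text=lambda: ["Some text to chunk."], max_chunk_size=1024)

for chunk in chunker.read():
    print(chunk.chunk_index, chunk.chunk_size, chunk.cut_type)
```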

View file

@@ -9,33 +9,23 @@ logger = logging.getLogger(__name__)
 class TextChunker(Chunker):
-    def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
-        word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
-        token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_chunk_tokens
-        return word_count_fits and token_count_fits
-
     def read(self):
         paragraph_chunks = []
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
-                self.max_chunk_tokens,
                 self.max_chunk_size,
                 batch_paragraphs=True,
             ):
-                if self.check_word_count_and_token_count(
-                    self.chunk_size, self.token_count, chunk_data
-                ):
+                if self.chunk_size + chunk_data["chunk_size"] <= self.max_chunk_size:
                     paragraph_chunks.append(chunk_data)
-                    self.chunk_size += chunk_data["word_count"]
-                    self.token_count += chunk_data["token_count"]
+                    self.chunk_size += chunk_data["chunk_size"]
                 else:
                     if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
                             id=chunk_data["chunk_id"],
                             text=chunk_data["text"],
-                            word_count=chunk_data["word_count"],
-                            token_count=chunk_data["token_count"],
+                            chunk_size=chunk_data["chunk_size"],
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=chunk_data["cut_type"],
@@ -54,8 +44,7 @@ class TextChunker(Chunker):
                                 NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"
                             ),
                             text=chunk_text,
-                            word_count=self.chunk_size,
-                            token_count=self.token_count,
+                            chunk_size=self.chunk_size,
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
@@ -68,8 +57,7 @@ class TextChunker(Chunker):
                         logger.error(e)
                         raise e
                     paragraph_chunks = [chunk_data]
-                    self.chunk_size = chunk_data["word_count"]
-                    self.token_count = chunk_data["token_count"]
+                    self.chunk_size = chunk_data["chunk_size"]

                 self.chunk_index += 1
@@ -78,8 +66,7 @@ class TextChunker(Chunker):
             yield DocumentChunk(
                 id=uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                 text=" ".join(chunk["text"] for chunk in paragraph_chunks),
-                word_count=self.chunk_size,
-                token_count=self.token_count,
+                chunk_size=self.chunk_size,
                 is_part_of=self.document,
                 chunk_index=self.chunk_index,
                 cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
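
The batching rule is now a single size comparison. A standalone sketch of the invariant it maintains (hypothetical chunk dicts for illustration, not cognee code):

```python
# Greedy batching: accumulate paragraph chunks until the next one would
# overflow max_chunk_size, then flush the batch as one DocumentChunk.
chunks = [{"chunk_size": 400}, {"chunk_size": 500}, {"chunk_size": 300}]
max_chunk_size = 1024

batch, total = [], 0
for chunk in chunks:
    if total + chunk["chunk_size"] <= max_chunk_size:
        batch.append(chunk)  # still fits in the current DocumentChunk
        total += chunk["chunk_size"]
    else:
        # flush 'batch' as one DocumentChunk, then restart with this chunk
        batch, total = [chunk], chunk["chunk_size"]
# => the 400- and 500-unit chunks are flushed together; 300 starts the next one.
```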

View file

@@ -7,8 +7,7 @@ from cognee.modules.engine.models import Entity
 class DocumentChunk(DataPoint):
     text: str
-    word_count: int
-    token_count: int
+    chunk_size: int
     chunk_index: int
     cut_type: str
     is_part_of: Document

View file

@@ -11,13 +11,11 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the audio file
         text = self.create_transcript()

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=lambda: [text])

         yield from chunker.read()

View file

@@ -10,7 +10,5 @@ class Document(DataPoint):
     mime_type: str
     metadata: dict = {"index_fields": ["name"]}

-    def read(
-        self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: Optional[int] = None
-    ) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         pass

View file

@@ -11,12 +11,10 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the image file
         text = self.transcribe_image()

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=lambda: [text], max_chunk_size=max_chunk_size)

         yield from chunker.read()

View file

@@ -7,7 +7,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         file = PdfReader(self.raw_data_location)

         def get_text():
@@ -15,9 +15,7 @@ class PdfDocument(Document):
             page_text = page.extract_text()
             yield page_text

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)

         yield from chunker.read()

View file

@@ -5,7 +5,7 @@ from cognee.modules.chunking.Chunker import Chunker
 class TextDocument(Document):
     type: str = "text"

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
@@ -16,8 +16,6 @@ class TextDocument(Document):

                     yield text

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)

         yield from chunker.read()

View file

@@ -9,7 +9,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -28,8 +28,6 @@ class UnstructuredDocument(Document):

                     yield text

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)

         yield from chunker.read()

View file

@@ -1,15 +1,12 @@
 from typing import Any, Dict, Iterator
 from uuid import NAMESPACE_OID, uuid5

-from cognee.infrastructure.databases.vector import get_vector_engine
 from .chunk_by_sentence import chunk_by_sentence


 def chunk_by_paragraph(
     data: str,
-    max_chunk_tokens,
-    paragraph_length: int = 1024,
+    max_chunk_size,
     batch_paragraphs: bool = True,
 ) -> Iterator[Dict[str, Any]]:
     """
@@ -23,28 +20,19 @@ def chunk_by_paragraph(
     - Remaining text at the end of the input will be yielded as a final chunk.
     """
     current_chunk = ""
-    current_word_count = 0
     chunk_index = 0
     paragraph_ids = []
     last_cut_type = None
-    current_token_count = 0
+    current_chunk_size = 0

-    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(
-        data, maximum_length=paragraph_length
+    for paragraph_id, sentence, sentence_size, end_type in chunk_by_sentence(
+        data, maximum_size=max_chunk_size
     ):
-        # Check if this sentence would exceed length limit
-        embedding_engine = get_vector_engine().embedding_engine
-        token_count = embedding_engine.tokenizer.count_tokens(sentence)
-        if current_word_count > 0 and (
-            current_word_count + word_count > paragraph_length
-            or current_token_count + token_count > max_chunk_tokens
-        ):
+        if current_chunk_size > 0 and (current_chunk_size + sentence_size > max_chunk_size):
             # Yield current chunk
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "paragraph_ids": paragraph_ids,
                 "chunk_index": chunk_index,
@@ -56,22 +44,19 @@ def chunk_by_paragraph(
             # Start new chunk with current sentence
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1

         paragraph_ids.append(paragraph_id)
         current_chunk += sentence
-        current_word_count += word_count
-        current_token_count += token_count
+        current_chunk_size += sentence_size

         # Handle end of paragraph
         if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
             # For non-batch mode, yield each paragraph separately
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "paragraph_ids": paragraph_ids,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "chunk_index": chunk_index,
@@ -80,8 +65,7 @@ def chunk_by_paragraph(
             yield chunk_dict
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1

         last_cut_type = end_type
@@ -90,8 +74,7 @@ def chunk_by_paragraph(
     if current_chunk:
         chunk_dict = {
             "text": current_chunk,
-            "word_count": current_word_count,
-            "token_count": current_token_count,
+            "chunk_size": current_chunk_size,
             "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
             "paragraph_ids": paragraph_ids,
             "chunk_index": chunk_index,

View file

@@ -1,10 +1,19 @@
 from uuid import uuid4, UUID
 from typing import Optional, Iterator, Tuple
 from .chunk_by_word import chunk_by_word
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+
+
+def get_word_size(word: str) -> int:
+    embedding_engine = get_embedding_engine()
+    if embedding_engine.tokenizer:
+        return embedding_engine.tokenizer.count_tokens(word)
+    else:
+        return 1


 def chunk_by_sentence(
-    data: str, maximum_length: Optional[int] = None
+    data: str, maximum_size: Optional[int] = None
 ) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
     """
     Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
@@ -16,7 +25,7 @@ def chunk_by_sentence(
     """
     sentence = ""
     paragraph_id = uuid4()
-    word_count = 0
+    sentence_size = 0
     section_end = False
     word_type_state = None
@@ -25,8 +34,7 @@ def chunk_by_sentence(
     # and words with the same characteristics connect it to a preceding
     # word with word_type 'paragraph_end' or 'sentence_end'
     for word, word_type in chunk_by_word(data):
-        sentence += word
-        word_count += 1
+        word_size = get_word_size(word)

         if word_type in ["paragraph_end", "sentence_end"]:
             word_type_state = word_type
@@ -36,19 +44,31 @@ def chunk_by_sentence(
                 word_type_state = word_type
                 break

-        if word_type in ["paragraph_end", "sentence_end"] or (
-            maximum_length and (word_count == maximum_length)
-        ):
-            yield (paragraph_id, sentence, word_count, word_type_state)
-            sentence = ""
-            word_count = 0
+        if maximum_size and (sentence_size + word_size > maximum_size):
+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = word
+            sentence_size = word_size
+
+        elif word_type in ["paragraph_end", "sentence_end"]:
+            sentence += word
+            sentence_size += word_size
             paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = ""
+            sentence_size = 0
+
+        else:
+            sentence += word
+            sentence_size += word_size

     if len(sentence) > 0:
+        if maximum_size and sentence_size > maximum_size:
+            raise ValueError(f"Input word {word} longer than chunking size {maximum_size}.")
         section_end = "sentence_cut" if word_type_state == "word" else word_type_state
         yield (
             paragraph_id,
             sentence,
-            word_count,
+            sentence_size,
             section_end,
         )
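
The unit of `max_chunk_size` therefore depends on the configured embedding engine: tokens when a tokenizer exists, plain words otherwise. A standalone sketch of that fallback (the `Tokenizer` protocol here is a hypothetical stand-in, not a cognee type):

```python
from typing import Optional, Protocol


class Tokenizer(Protocol):  # hypothetical stand-in for the engine's tokenizer
    def count_tokens(self, text: str) -> int: ...


def word_size(word: str, tokenizer: Optional[Tokenizer]) -> int:
    # Mirrors get_word_size above: a token count when a tokenizer is
    # configured, otherwise 1, so max_chunk_size degrades to a word budget.
    return tokenizer.count_tokens(word) if tokenizer else 1
```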

View file

@@ -2,7 +2,7 @@ import re
 from typing import Iterator, Tuple

-SENTENCE_ENDINGS = r"[.;!?…]"
+SENTENCE_ENDINGS = r"[.;!?…。!?]"
 PARAGRAPH_ENDINGS = r"[\n\r]"
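
The widened character class now treats full-width CJK punctuation as sentence boundaries; a quick check:

```python
import re

SENTENCE_ENDINGS = r"[.;!?…。!?]"

# Full-width 。!? (used in Chinese and Japanese) now match alongside
# the ASCII sentence endings.
assert re.search(SENTENCE_ENDINGS, "你好。")
assert re.search(SENTENCE_ENDINGS, "Hello, world!")
```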

View file

@@ -26,8 +26,7 @@ async def update_document_token_count(document_id: UUID, token_count: int) -> None:
 async def extract_chunks_from_documents(
     documents: list[Document],
-    max_chunk_tokens: int,
-    chunk_size: int = 1024,
+    max_chunk_size: int,
     chunker: Chunker = TextChunker,
 ) -> AsyncGenerator:
     """
@@ -39,10 +38,9 @@ async def extract_chunks_from_documents(
     """
     for document in documents:
         document_token_count = 0
-        for document_chunk in document.read(
-            chunk_size=chunk_size, chunker_cls=chunker, max_chunk_tokens=max_chunk_tokens
-        ):
-            document_token_count += document_chunk.token_count
+        for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
+            document_token_count += document_chunk.chunk_size
             yield document_chunk

         await update_document_token_count(document.id, document_token_count)
+        # todo rita
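
Since this is an async generator, callers drain it with `async for`. A minimal sketch, assuming `documents` is a list of `Document` instances and the import path used elsewhere in cognee's cognify pipeline:

```python
from cognee.modules.chunking.TextChunker import TextChunker
from cognee.tasks.documents import extract_chunks_from_documents  # path assumed


async def collect_chunks(documents):
    chunks = []
    # The per-document total recorded by update_document_token_count is now
    # a sum of unified chunk_size values rather than separate token counts.
    async for chunk in extract_chunks_from_documents(
        documents, max_chunk_size=1024, chunker=TextChunker
    ):
        chunks.append(chunk)
    return chunks
```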

View file

@@ -2,6 +2,17 @@ import uuid
 from unittest.mock import patch

 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+

 GROUND_TRUTH = [
     {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
@@ -24,7 +35,10 @@ TEST_TEXT = """
 "The feature ships, Sarah. That's final.\""""


-def test_AudioDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_AudioDocument(mock_engine):
     document = AudioDocument(
         id=uuid.uuid4(),
         name="audio-dummy-test",
@@ -35,10 +49,10 @@ def test_AudioDocument(mock_engine):
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'

View file

@@ -2,6 +2,11 @@ import uuid
 from unittest.mock import patch

 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+

 GROUND_TRUTH = [
     {"word_count": 51, "len_text": 298, "cut_type": "sentence_end"},
@@ -13,7 +18,10 @@ TEST_TEXT = """A dramatic confrontation unfolds as a red fox and river otter eng
 The commotion has attracted an audience: a murder of crows has gathered in the low branches, their harsh calls adding to the chaos as they hop excitedly from limb to limb. One particularly bold crow dive-bombs the wrestling pair, causing both animals to momentarily freeze mid-tussle, creating a perfect snapshot of suspended action—the fox's fur dripping wet, the otter's body coiled like a spring, and the crow's wings spread wide against the golden morning light."""


-def test_ImageDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_ImageDocument(mock_engine):
     document = ImageDocument(
         id=uuid.uuid4(),
         name="image-dummy-test",
@@ -24,10 +32,10 @@ def test_ImageDocument(mock_engine):
     with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'

View file

@@ -2,6 +2,12 @@ import os
 import uuid

 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+from unittest.mock import patch
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+

 GROUND_TRUTH = [
     {"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
@@ -9,7 +15,10 @@ GROUND_TRUTH = [
 ]


-def test_PdfDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_PdfDocument(mock_engine):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -25,10 +34,10 @@ def test_PdfDocument(mock_engine):
     )

     for ground_truth, paragraph_data in zip(
-        GROUND_TRUTH, document.read(chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=2048)
+        GROUND_TRUTH, document.read(chunker_cls=TextChunker, max_chunk_size=1024)
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'

View file

@@ -4,6 +4,12 @@ import uuid
 import pytest

 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.TextDocument import TextDocument
+from unittest.mock import patch
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+

 GROUND_TRUTH = {
     "code.txt": [
@@ -21,7 +27,10 @@ GROUND_TRUTH = {
     "input_file,chunk_size",
     [("code.txt", 256), ("Natural_language_processing.txt", 128)],
 )
-def test_TextDocument(input_file, chunk_size):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_TextDocument(mock_engine, input_file, chunk_size):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -38,10 +47,10 @@ def test_TextDocument(mock_engine, input_file, chunk_size):

     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH[input_file],
-        document.read(chunk_size=chunk_size, chunker_cls=TextChunker, max_chunk_tokens=1024),
+        document.read(chunker_cls=TextChunker, max_chunk_size=chunk_size),
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'

View file

@@ -1,10 +1,18 @@
 import os
 import uuid
+from unittest.mock import patch

 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")


-def test_UnstructuredDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_UnstructuredDocument(mock_engine):
     # Define file paths of test data
     pptx_file_path = os.path.join(
         os.sep,
@@ -68,30 +76,24 @@ def test_UnstructuredDocument(mock_engine):
     )

     # Test PPTX
-    for paragraph_data in pptx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
+    for paragraph_data in pptx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 19 == paragraph_data.chunk_size, f" 19 != {paragraph_data.chunk_size = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"
         )

     # Test DOCX
-    for paragraph_data in docx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
+    for paragraph_data in docx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 16 == paragraph_data.chunk_size, f" 16 != {paragraph_data.chunk_size = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
         assert "sentence_end" == paragraph_data.cut_type, (
             f" sentence_end != {paragraph_data.cut_type = }"
         )

     # TEST CSV
-    for paragraph_data in csv_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
+    for paragraph_data in csv_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 15 == paragraph_data.chunk_size, f" 15 != {paragraph_data.chunk_size = }"
         assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
             f"Read text doesn't match expected text: {paragraph_data.text}"
         )
@@ -100,10 +102,8 @@ def test_UnstructuredDocument(mock_engine):
     )

     # Test XLSX
-    for paragraph_data in xlsx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
+    for paragraph_data in xlsx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 36 == paragraph_data.chunk_size, f" 36 != {paragraph_data.chunk_size = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"

View file

@@ -3,29 +3,26 @@ from itertools import product
 import numpy as np
 import pytest

-from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+from cognee.tasks.chunks import chunk_by_paragraph

-paragraph_lengths = [64, 256, 1024]
 batch_paragraphs_vals = [True, False]
-max_chunk_tokens_vals = [512, 1024, 4096]
+max_chunk_size_vals = [512, 1024, 4096]


 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_chunk_by_paragraph_isomorphism(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
-    chunks = chunk_by_paragraph(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs)
+def test_chunk_by_paragraph_isomorphism(input_text, max_chunk_size, batch_paragraphs):
+    chunks = chunk_by_paragraph(input_text, max_chunk_size, batch_paragraphs)
     reconstructed_text = "".join([chunk["text"] for chunk in chunks])
     assert reconstructed_text == input_text, (
         f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
@@ -33,52 +30,49 @@ def test_chunk_by_paragraph_isomorphism(input_text, max_chunk_size, batch_paragraphs):

 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size, batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_paragraph_chunk_length(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs):
+def test_paragraph_chunk_length(input_text, max_chunk_size, batch_paragraphs):
     chunks = list(
         chunk_by_paragraph(
             data=input_text,
-            max_chunk_tokens=max_chunk_tokens,
-            paragraph_length=paragraph_length,
+            max_chunk_size=max_chunk_size,
             batch_paragraphs=batch_paragraphs,
         )
     )
+    embedding_engine = get_embedding_engine()

-    chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk["text"]) for chunk in chunks]
+    )

-    larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
-    assert np.all(chunk_lengths <= paragraph_length), (
-        f"{paragraph_length = }: {larger_chunks} are too large"
+    larger_chunks = chunk_lengths[chunk_lengths > max_chunk_size]
+    assert np.all(chunk_lengths <= max_chunk_size), (
+        f"{max_chunk_size = }: {larger_chunks} are too large"
     )


 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_chunk_by_paragraph_chunk_numbering(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
+def test_chunk_by_paragraph_chunk_numbering(input_text, max_chunk_size, batch_paragraphs):
     chunks = chunk_by_paragraph(
         data=input_text,
-        max_chunk_tokens=max_chunk_tokens,
-        paragraph_length=paragraph_length,
+        max_chunk_size=max_chunk_size,
         batch_paragraphs=batch_paragraphs,
     )
     chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])

View file

@@ -1,37 +1,49 @@
+from unittest.mock import patch
 from cognee.tasks.chunks import chunk_by_paragraph
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+

 GROUND_TRUTH = {
     "whole_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThird paragraph is a bit longer and is finished with a dot.",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_end",
         },
     ],
     "cut_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThird paragraph is cut and is missing the dot at the end",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_cut",
         },
     ],
@@ -47,17 +59,18 @@ Third paragraph is cut and is missing the dot at the end""",
 }


-def run_chunking_test(test_text, expected_chunks):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def run_chunking_test(test_text, expected_chunks, mock_engine):
     chunks = []
-    for chunk_data in chunk_by_paragraph(
-        data=test_text, paragraph_length=12, batch_paragraphs=False, max_chunk_tokens=512
-    ):
+    for chunk_data in chunk_by_paragraph(data=test_text, batch_paragraphs=False, max_chunk_size=12):
         chunks.append(chunk_data)

     assert len(chunks) == 3

     for expected_chunks_item, chunk in zip(expected_chunks, chunks):
-        for key in ["text", "word_count", "cut_type"]:
+        for key in ["text", "chunk_size", "cut_type"]:
             assert chunk[key] == expected_chunks_item[key], (
                 f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
             )

View file

@@ -3,10 +3,11 @@ from itertools import product
 import numpy as np
 import pytest

-from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tasks.chunks import chunk_by_sentence
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS_LONGWORDS, INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine

-maximum_length_vals = [None, 8, 64]
+maximum_length_vals = [None, 16, 64]


 @pytest.mark.parametrize(
@@ -33,9 +34,26 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
 def test_paragraph_chunk_length(input_text, maximum_length):
     chunks = list(chunk_by_sentence(input_text, maximum_length))

-    chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
+    embedding_engine = get_embedding_engine()
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk[1]) for chunk in chunks]
+    )

     larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
     assert np.all(chunk_lengths <= maximum_length), (
         f"{maximum_length = }: {larger_chunks} are too large"
     )
+
+
+@pytest.mark.parametrize(
+    "input_text,maximum_length",
+    list(
+        product(
+            list(INPUT_TEXTS_LONGWORDS.values()),
+            [val for val in maximum_length_vals if val is not None],
+        )
+    ),
+)
+def test_paragraph_chunk_long_input(input_text, maximum_length):
+    with pytest.raises(ValueError):
+        list(chunk_by_sentence(input_text, maximum_length))

View file

@@ -2,7 +2,7 @@ import numpy as np
 import pytest

 from cognee.tasks.chunks import chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS, INPUT_TEXTS_LONGWORDS


 @pytest.mark.parametrize(
@@ -11,7 +11,7 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS, INPUT_TEXTS_LONGWORDS
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_isomorphism(input_text):
@@ -28,7 +28,7 @@ def test_chunk_by_word_isomorphism(input_text):
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_splits(input_text):

View file

@@ -179,7 +179,6 @@ def pad(
     mode: _ModeFunc,
     **kwargs: Any,
 ) -> NDArray[Any]: ...""",
-    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
     "english_text": """O for that warning voice, which he who saw
 Th' Apocalyps, heard cry in Heaven aloud,
 Then when the Dragon, put to second rout,
@@ -282,3 +281,7 @@ For never can true reconcilement grow
 Where wounds of deadly hate have peirc'd so deep:
 Which would but lead me to a worse relapse [ 100 ]""",
 }
+
+INPUT_TEXTS_LONGWORDS = {
+    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
+}

File diff suppressed because one or more lines are too long