feat: Eliminate the use of max_chunk_tokens and use a unified max_chunk_size instead [cog-1381] (#626)
## Description
Replaces the separate `max_chunk_tokens` and `chunk_size` limits with a single `max_chunk_size` parameter across chunking, document reading, and chunk extraction.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **Refactor**
  - Simplified text processing by unifying multiple size-related parameters into a single metric across chunking and extraction functionality.
  - Streamlined text segmentation by removing redundant calculations and checks, resulting in more consistent chunk management.
- **Chores**
  - Removed the `modal` package as a dependency.
- **Documentation**
  - Updated README.md with a new demo video link and clarified default environment variable settings.
  - Improved CONTRIBUTING.md for clarity and engagement with potential contributors.
- **Bug Fixes**
  - Extended sentence-ending punctuation handling to additional characters.
- **Version Update**
  - Bumped the project version to 0.1.33 in pyproject.toml.
Parent: b78d9f196f
Commit: c1f7b667d1
27 changed files with 285 additions and 219 deletions
@@ -113,7 +113,7 @@ def generate_dataset_name(dataset_name: str) -> str:
 async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
-    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunk_size=1024, chunker=TextChunker
+    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker
 ) -> list[Task]:
     if user is None:
         user = await get_default_user()
@@ -125,9 +125,8 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
             Task(check_permissions_on_documents, user=user, permissions=["write"]),
             Task(
                 extract_chunks_from_documents,
-                max_chunk_tokens=get_max_chunk_tokens(),
+                max_chunk_size=get_max_chunk_tokens(),
                 chunker=chunker,
-                chunk_size=chunk_size,
             ),  # Extract text chunks based on the document type.
             Task(
                 extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
         await cognee.add(self.raw_corpus)

-        tasks = await self.task_getter(chunk_size=chunk_size, chunker=TextChunker)
+        tasks = await self.task_getter(chunker=TextChunker)
         await cognee.cognify(tasks=tasks)
@@ -48,7 +48,6 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
     )
     questions = await corpus_builder.build_corpus(
         limit=params.get("number_of_samples_in_corpus"),
-        chunk_size=chunk_size,
         chunker=chunker,
         load_golden_context=params.get("evaluating_contexts"),
     )
@@ -1,13 +1,12 @@
 class Chunker:
-    def __init__(self, document, get_text: callable, max_chunk_tokens: int, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, max_chunk_size: int):
         self.chunk_index = 0
         self.chunk_size = 0
         self.token_count = 0

         self.document = document
-        self.max_chunk_size = chunk_size
+        self.max_chunk_size = max_chunk_size
         self.get_text = get_text
-        self.max_chunk_tokens = max_chunk_tokens

     def read(self):
         raise NotImplementedError
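With the constructor reduced to `(document, get_text, max_chunk_size)`, a custom chunker only has to respect a single budget. Below is a toy sketch: the `SentenceListChunker` class, its splitting rule, and the sample text are illustrative and not part of this PR (real chunkers such as `TextChunker` yield `DocumentChunk` objects rather than plain strings).

```python
from cognee.modules.chunking.Chunker import Chunker


class SentenceListChunker(Chunker):
    """Toy subclass: treats max_chunk_size as a word budget and yields plain strings."""

    def read(self):
        for text in self.get_text():
            batch, size = [], 0
            for sentence in text.split(". "):
                words = len(sentence.split())
                if size and size + words > self.max_chunk_size:
                    yield " ".join(batch)
                    batch, size = [], 0
                batch.append(sentence)
                size += words
            if batch:
                yield " ".join(batch)


chunker = SentenceListChunker(
    document=None,  # a real pipeline passes a Document; None is enough for this toy read()
    get_text=lambda: ["One sentence. Another sentence. A third, slightly longer sentence."],
    max_chunk_size=8,
)
print(list(chunker.read()))
```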
@@ -9,33 +9,23 @@ logger = logging.getLogger(__name__)
 class TextChunker(Chunker):
-    def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
-        word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
-        token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_chunk_tokens
-        return word_count_fits and token_count_fits
-
     def read(self):
         paragraph_chunks = []
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
-                self.max_chunk_tokens,
                 self.max_chunk_size,
                 batch_paragraphs=True,
             ):
-                if self.check_word_count_and_token_count(
-                    self.chunk_size, self.token_count, chunk_data
-                ):
+                if self.chunk_size + chunk_data["chunk_size"] <= self.max_chunk_size:
                     paragraph_chunks.append(chunk_data)
-                    self.chunk_size += chunk_data["word_count"]
-                    self.token_count += chunk_data["token_count"]
+                    self.chunk_size += chunk_data["chunk_size"]
                 else:
                     if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
                             id=chunk_data["chunk_id"],
                             text=chunk_data["text"],
-                            word_count=chunk_data["word_count"],
-                            token_count=chunk_data["token_count"],
+                            chunk_size=chunk_data["chunk_size"],
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=chunk_data["cut_type"],
@@ -54,8 +44,7 @@ class TextChunker(Chunker):
                                 NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"
                             ),
                             text=chunk_text,
-                            word_count=self.chunk_size,
-                            token_count=self.token_count,
+                            chunk_size=self.chunk_size,
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
@@ -68,8 +57,7 @@ class TextChunker(Chunker):
                             logger.error(e)
                             raise e
                     paragraph_chunks = [chunk_data]
-                    self.chunk_size = chunk_data["word_count"]
-                    self.token_count = chunk_data["token_count"]
+                    self.chunk_size = chunk_data["chunk_size"]

                 self.chunk_index += 1
@@ -78,8 +66,7 @@ class TextChunker(Chunker):
             yield DocumentChunk(
                 id=uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                 text=" ".join(chunk["text"] for chunk in paragraph_chunks),
-                word_count=self.chunk_size,
-                token_count=self.token_count,
+                chunk_size=self.chunk_size,
                 is_part_of=self.document,
                 chunk_index=self.chunk_index,
                 cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
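The rule that replaces `check_word_count_and_token_count` amounts to flushing the accumulated paragraphs as soon as the next piece would push the running `chunk_size` past `max_chunk_size`. A standalone sketch of that rule (all names are local to the sketch, not cognee API):

```python
def batch_by_size(pieces, max_chunk_size):
    """Group (text, size) pairs so that no batch exceeds max_chunk_size."""
    batch, size = [], 0
    for text, piece_size in pieces:
        if size and size + piece_size > max_chunk_size:
            yield batch
            batch, size = [], 0
        batch.append(text)
        size += piece_size
    if batch:
        yield batch


print(list(batch_by_size([("a", 5), ("b", 4), ("c", 3)], max_chunk_size=8)))
# -> [['a'], ['b', 'c']]
```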
@@ -7,8 +7,7 @@ from cognee.modules.engine.models import Entity
 class DocumentChunk(DataPoint):
     text: str
-    word_count: int
-    token_count: int
+    chunk_size: int
     chunk_index: int
     cut_type: str
     is_part_of: Document
@@ -11,13 +11,11 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the audio file

         text = self.create_transcript()

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=lambda: [text])

         yield from chunker.read()
@@ -10,7 +10,5 @@ class Document(DataPoint):
     mime_type: str
     metadata: dict = {"index_fields": ["name"]}

-    def read(
-        self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: Optional[int] = None
-    ) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         pass
@@ -11,12 +11,10 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the image file
         text = self.transcribe_image()

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=lambda: [text], max_chunk_size=max_chunk_size)

         yield from chunker.read()
@@ -7,7 +7,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         file = PdfReader(self.raw_data_location)

         def get_text():
@@ -15,9 +15,7 @@ class PdfDocument(Document):
                 page_text = page.extract_text()
                 yield page_text

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)

         yield from chunker.read()
@@ -5,7 +5,7 @@ from cognee.modules.chunking.Chunker import Chunker
 class TextDocument(Document):
     type: str = "text"

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
@@ -16,8 +16,6 @@ class TextDocument(Document):
                     yield text

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)

         yield from chunker.read()
@@ -9,7 +9,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"

-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -28,8 +28,6 @@ class UnstructuredDocument(Document):
             yield text

-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)

         yield from chunker.read()
@@ -1,15 +1,12 @@
 from typing import Any, Dict, Iterator
 from uuid import NAMESPACE_OID, uuid5

-from cognee.infrastructure.databases.vector import get_vector_engine
-
 from .chunk_by_sentence import chunk_by_sentence


 def chunk_by_paragraph(
     data: str,
-    max_chunk_tokens,
-    paragraph_length: int = 1024,
+    max_chunk_size,
     batch_paragraphs: bool = True,
 ) -> Iterator[Dict[str, Any]]:
     """
@@ -23,28 +20,19 @@ def chunk_by_paragraph(
     - Remaining text at the end of the input will be yielded as a final chunk.
     """
     current_chunk = ""
-    current_word_count = 0
     chunk_index = 0
     paragraph_ids = []
     last_cut_type = None
-    current_token_count = 0
+    current_chunk_size = 0

-    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(
-        data, maximum_length=paragraph_length
+    for paragraph_id, sentence, sentence_size, end_type in chunk_by_sentence(
+        data, maximum_size=max_chunk_size
     ):
-        # Check if this sentence would exceed length limit
-        embedding_engine = get_vector_engine().embedding_engine
-        token_count = embedding_engine.tokenizer.count_tokens(sentence)
-
-        if current_word_count > 0 and (
-            current_word_count + word_count > paragraph_length
-            or current_token_count + token_count > max_chunk_tokens
-        ):
+        if current_chunk_size > 0 and (current_chunk_size + sentence_size > max_chunk_size):
             # Yield current chunk
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "paragraph_ids": paragraph_ids,
                 "chunk_index": chunk_index,
@@ -56,22 +44,19 @@ def chunk_by_paragraph(
             # Start new chunk with current sentence
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1

         paragraph_ids.append(paragraph_id)
         current_chunk += sentence
-        current_word_count += word_count
-        current_token_count += token_count
+        current_chunk_size += sentence_size

         # Handle end of paragraph
         if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
             # For non-batch mode, yield each paragraph separately
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "paragraph_ids": paragraph_ids,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "chunk_index": chunk_index,
@@ -80,8 +65,7 @@ def chunk_by_paragraph(
             yield chunk_dict
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1

         last_cut_type = end_type
@@ -90,8 +74,7 @@ def chunk_by_paragraph(
     if current_chunk:
         chunk_dict = {
             "text": current_chunk,
-            "word_count": current_word_count,
-            "token_count": current_token_count,
+            "chunk_size": current_chunk_size,
             "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
             "paragraph_ids": paragraph_ids,
             "chunk_index": chunk_index,
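A minimal usage sketch of the unified `chunk_by_paragraph` signature. The embedding engine is patched out the same way the updated unit tests do it, so no vector backend needs to be configured; the sample text is illustrative.

```python
import sys
from unittest.mock import patch

from cognee.tasks.chunks import chunk_by_paragraph

# Importing the package above loads the chunk_by_sentence submodule, so it can be
# looked up and patched just like the updated tests do.
chunk_by_sentence_module = sys.modules["cognee.tasks.chunks.chunk_by_sentence"]


def mock_get_embedding_engine():
    class MockEngine:
        tokenizer = None  # no tokenizer -> get_word_size() counts 1 per word

    return MockEngine()


with patch.object(
    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
):
    text = "This is example text. It contains multiple sentences.\nThis is a second paragraph."
    for chunk in chunk_by_paragraph(data=text, max_chunk_size=12, batch_paragraphs=False):
        print(chunk["chunk_index"], chunk["chunk_size"], chunk["cut_type"], repr(chunk["text"]))
```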
@@ -1,10 +1,19 @@
 from uuid import uuid4, UUID
 from typing import Optional, Iterator, Tuple
 from .chunk_by_word import chunk_by_word
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+
+
+def get_word_size(word: str) -> int:
+    embedding_engine = get_embedding_engine()
+    if embedding_engine.tokenizer:
+        return embedding_engine.tokenizer.count_tokens(word)
+    else:
+        return 1


 def chunk_by_sentence(
-    data: str, maximum_length: Optional[int] = None
+    data: str, maximum_size: Optional[int] = None
 ) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
     """
     Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
@@ -16,7 +25,7 @@ def chunk_by_sentence(
     """
     sentence = ""
     paragraph_id = uuid4()
-    word_count = 0
+    sentence_size = 0
     section_end = False
     word_type_state = None

@@ -25,8 +34,7 @@ def chunk_by_sentence(
     # and words with the same characteristics connect it to a preceding
     # word with word_type 'paragraph_end' or 'sentence_end'
     for word, word_type in chunk_by_word(data):
-        sentence += word
-        word_count += 1
+        word_size = get_word_size(word)

         if word_type in ["paragraph_end", "sentence_end"]:
             word_type_state = word_type
@@ -36,19 +44,31 @@ def chunk_by_sentence(
                     word_type_state = word_type
                     break

-        if word_type in ["paragraph_end", "sentence_end"] or (
-            maximum_length and (word_count == maximum_length)
-        ):
-            yield (paragraph_id, sentence, word_count, word_type_state)
-            sentence = ""
-            word_count = 0
+        if maximum_size and (sentence_size + word_size > maximum_size):
+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = word
+            sentence_size = word_size
+
+        elif word_type in ["paragraph_end", "sentence_end"]:
+            sentence += word
+            sentence_size += word_size
             paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id

+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = ""
+            sentence_size = 0
+        else:
+            sentence += word
+            sentence_size += word_size
+
     if len(sentence) > 0:
+        if maximum_size and sentence_size > maximum_size:
+            raise ValueError(f"Input word {word} longer than chunking size {maximum_size}.")
+
         section_end = "sentence_cut" if word_type_state == "word" else word_type_state
         yield (
             paragraph_id,
             sentence,
-            word_count,
+            sentence_size,
             section_end,
         )
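Under the new accounting, `maximum_size` is a token budget when the embedding engine exposes a tokenizer and a plain word budget otherwise (via `get_word_size`), and a single word larger than the budget now raises `ValueError`. A sketch using the same mock as the updated tests; the sample text is illustrative.

```python
import sys
from unittest.mock import patch

from cognee.tasks.chunks import chunk_by_sentence

chunk_by_sentence_module = sys.modules["cognee.tasks.chunks.chunk_by_sentence"]


def mock_get_embedding_engine():
    class MockEngine:
        tokenizer = None  # fall back to a size of 1 per word

    return MockEngine()


with patch.object(
    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
):
    text = "One short sentence. Another one follows here.\nA new paragraph starts."
    for paragraph_id, sentence, sentence_size, end_type in chunk_by_sentence(
        text, maximum_size=16
    ):
        print(sentence_size, end_type, repr(sentence))
```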
@@ -2,7 +2,7 @@ import re
 from typing import Iterator, Tuple


-SENTENCE_ENDINGS = r"[.;!?…]"
+SENTENCE_ENDINGS = r"[.;!?…。!?]"
 PARAGRAPH_ENDINGS = r"[\n\r]"
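The widened `SENTENCE_ENDINGS` set now also covers the CJK full stop and the fullwidth `!` and `?`. A quick standalone check (the pattern string is copied from the diff above):

```python
import re

SENTENCE_ENDINGS = r"[.;!?…。!?]"

for character in [".", ";", "…", "。", "!", "?"]:
    print(character, bool(re.fullmatch(SENTENCE_ENDINGS, character)))
```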
@@ -26,8 +26,7 @@ async def update_document_token_count(document_id: UUID, token_count: int) -> No
 async def extract_chunks_from_documents(
     documents: list[Document],
-    max_chunk_tokens: int,
-    chunk_size: int = 1024,
+    max_chunk_size: int,
     chunker: Chunker = TextChunker,
 ) -> AsyncGenerator:
     """
@@ -39,10 +38,9 @@ async def extract_chunks_from_documents(
     """
     for document in documents:
         document_token_count = 0
-        for document_chunk in document.read(
-            chunk_size=chunk_size, chunker_cls=chunker, max_chunk_tokens=max_chunk_tokens
-        ):
-            document_token_count += document_chunk.token_count
+        for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
+            document_token_count += document_chunk.chunk_size
             yield document_chunk

         await update_document_token_count(document.id, document_token_count)
+        # todo rita
@@ -2,6 +2,17 @@ import uuid
 from unittest.mock import patch
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+

 GROUND_TRUTH = [
     {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
@@ -24,7 +35,10 @@ TEST_TEXT = """
 "The feature ships, Sarah. That's final.\""""


-def test_AudioDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_AudioDocument(mock_engine):
     document = AudioDocument(
         id=uuid.uuid4(),
         name="audio-dummy-test",
@@ -35,10 +49,10 @@ def test_AudioDocument():
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
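The `@patch.object(...)` block above is repeated in each of the document and chunk test modules below. A hypothetical `conftest.py` fixture (not part of this PR) that would centralize the same mock:

```python
import sys
from unittest.mock import patch

import pytest

import cognee.tasks.chunks  # noqa: F401  (ensures the chunk_by_sentence submodule is loaded)

chunk_by_sentence_module = sys.modules["cognee.tasks.chunks.chunk_by_sentence"]


def mock_get_embedding_engine():
    class MockEngine:
        tokenizer = None

    return MockEngine()


@pytest.fixture
def mocked_embedding_engine():
    with patch.object(
        chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
    ) as mocked:
        yield mocked
```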
@@ -2,6 +2,11 @@ import uuid
 from unittest.mock import patch
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+

 GROUND_TRUTH = [
     {"word_count": 51, "len_text": 298, "cut_type": "sentence_end"},
@@ -13,7 +18,10 @@ TEST_TEXT = """A dramatic confrontation unfolds as a red fox and river otter eng
 The commotion has attracted an audience: a murder of crows has gathered in the low branches, their harsh calls adding to the chaos as they hop excitedly from limb to limb. One particularly bold crow dive-bombs the wrestling pair, causing both animals to momentarily freeze mid-tussle, creating a perfect snapshot of suspended action—the fox's fur dripping wet, the otter's body coiled like a spring, and the crow's wings spread wide against the golden morning light."""


-def test_ImageDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_ImageDocument(mock_engine):
     document = ImageDocument(
         id=uuid.uuid4(),
         name="image-dummy-test",
@@ -24,10 +32,10 @@ def test_ImageDocument():
     with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
@@ -2,6 +2,12 @@ import os
 import uuid
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+from unittest.mock import patch
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+

 GROUND_TRUTH = [
     {"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
@@ -9,7 +15,10 @@ GROUND_TRUTH = [
 ]


-def test_PdfDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_PdfDocument(mock_engine):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -25,10 +34,10 @@ def test_PdfDocument():
     )

     for ground_truth, paragraph_data in zip(
-        GROUND_TRUTH, document.read(chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=2048)
+        GROUND_TRUTH, document.read(chunker_cls=TextChunker, max_chunk_size=1024)
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
@@ -4,6 +4,12 @@ import uuid
 import pytest
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.TextDocument import TextDocument
+from unittest.mock import patch
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+

 GROUND_TRUTH = {
     "code.txt": [
@@ -21,7 +27,10 @@ GROUND_TRUTH = {
     "input_file,chunk_size",
     [("code.txt", 256), ("Natural_language_processing.txt", 128)],
 )
-def test_TextDocument(input_file, chunk_size):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_TextDocument(mock_engine, input_file, chunk_size):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -38,10 +47,10 @@ def test_TextDocument(input_file, chunk_size):

     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH[input_file],
-        document.read(chunk_size=chunk_size, chunker_cls=TextChunker, max_chunk_tokens=1024),
+        document.read(chunker_cls=TextChunker, max_chunk_size=chunk_size),
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
@@ -1,10 +1,18 @@
 import os
 import uuid
 from unittest.mock import patch
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+

-def test_UnstructuredDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_UnstructuredDocument(mock_engine):
     # Define file paths of test data
     pptx_file_path = os.path.join(
         os.sep,
@@ -68,30 +76,24 @@ def test_UnstructuredDocument():
     )

     # Test PPTX
-    for paragraph_data in pptx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
+    for paragraph_data in pptx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 19 == paragraph_data.chunk_size, f" 19 != {paragraph_data.chunk_size = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"
         )

     # Test DOCX
-    for paragraph_data in docx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
+    for paragraph_data in docx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 16 == paragraph_data.chunk_size, f" 16 != {paragraph_data.chunk_size = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
         assert "sentence_end" == paragraph_data.cut_type, (
             f" sentence_end != {paragraph_data.cut_type = }"
         )

     # TEST CSV
-    for paragraph_data in csv_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
+    for paragraph_data in csv_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 15 == paragraph_data.chunk_size, f" 15 != {paragraph_data.chunk_size = }"
         assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
             f"Read text doesn't match expected text: {paragraph_data.text}"
         )
@@ -100,10 +102,8 @@ def test_UnstructuredDocument():
     )

     # Test XLSX
-    for paragraph_data in xlsx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
+    for paragraph_data in xlsx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 36 == paragraph_data.chunk_size, f" 36 != {paragraph_data.chunk_size = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"
@@ -3,29 +3,26 @@ from itertools import product
 import numpy as np
 import pytest

-from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+from cognee.tasks.chunks import chunk_by_paragraph

-paragraph_lengths = [64, 256, 1024]
 batch_paragraphs_vals = [True, False]
-max_chunk_tokens_vals = [512, 1024, 4096]
+max_chunk_size_vals = [512, 1024, 4096]


 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_chunk_by_paragraph_isomorphism(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
-    chunks = chunk_by_paragraph(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs)
+def test_chunk_by_paragraph_isomorphism(input_text, max_chunk_size, batch_paragraphs):
+    chunks = chunk_by_paragraph(input_text, max_chunk_size, batch_paragraphs)
     reconstructed_text = "".join([chunk["text"] for chunk in chunks])
     assert reconstructed_text == input_text, (
         f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
@@ -33,52 +30,49 @@ def test_chunk_by_paragraph_isomorphism(

 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size, batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_paragraph_chunk_length(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs):
+def test_paragraph_chunk_length(input_text, max_chunk_size, batch_paragraphs):
     chunks = list(
         chunk_by_paragraph(
             data=input_text,
-            max_chunk_tokens=max_chunk_tokens,
-            paragraph_length=paragraph_length,
+            max_chunk_size=max_chunk_size,
             batch_paragraphs=batch_paragraphs,
         )
     )
+    embedding_engine = get_embedding_engine()

-    chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk["text"]) for chunk in chunks]
+    )

-    larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
-    assert np.all(chunk_lengths <= paragraph_length), (
-        f"{paragraph_length = }: {larger_chunks} are too large"
+    larger_chunks = chunk_lengths[chunk_lengths > max_chunk_size]
+    assert np.all(chunk_lengths <= max_chunk_size), (
+        f"{max_chunk_size = }: {larger_chunks} are too large"
     )


 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_chunk_by_paragraph_chunk_numbering(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
+def test_chunk_by_paragraph_chunk_numbering(input_text, max_chunk_size, batch_paragraphs):
     chunks = chunk_by_paragraph(
         data=input_text,
-        max_chunk_tokens=max_chunk_tokens,
-        paragraph_length=paragraph_length,
+        max_chunk_size=max_chunk_size,
         batch_paragraphs=batch_paragraphs,
     )
     chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
@@ -1,37 +1,49 @@
+from unittest.mock import patch
 from cognee.tasks.chunks import chunk_by_paragraph
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+

 GROUND_TRUTH = {
     "whole_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThird paragraph is a bit longer and is finished with a dot.",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_end",
         },
     ],
     "cut_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThird paragraph is cut and is missing the dot at the end",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_cut",
         },
     ],
@@ -47,17 +59,18 @@ Third paragraph is cut and is missing the dot at the end""",
 }


-def run_chunking_test(test_text, expected_chunks):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def run_chunking_test(test_text, expected_chunks, mock_engine):
     chunks = []
-    for chunk_data in chunk_by_paragraph(
-        data=test_text, paragraph_length=12, batch_paragraphs=False, max_chunk_tokens=512
-    ):
+    for chunk_data in chunk_by_paragraph(data=test_text, batch_paragraphs=False, max_chunk_size=12):
         chunks.append(chunk_data)

     assert len(chunks) == 3

     for expected_chunks_item, chunk in zip(expected_chunks, chunks):
-        for key in ["text", "word_count", "cut_type"]:
+        for key in ["text", "chunk_size", "cut_type"]:
             assert chunk[key] == expected_chunks_item[key], (
                 f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
             )
@@ -3,10 +3,11 @@ from itertools import product
 import numpy as np
 import pytest

-from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tasks.chunks import chunk_by_sentence
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS_LONGWORDS, INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine

-maximum_length_vals = [None, 8, 64]
+maximum_length_vals = [None, 16, 64]


 @pytest.mark.parametrize(
@@ -33,9 +34,26 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
 def test_paragraph_chunk_length(input_text, maximum_length):
     chunks = list(chunk_by_sentence(input_text, maximum_length))

-    chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
+    embedding_engine = get_embedding_engine()
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk[1]) for chunk in chunks]
+    )

     larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
     assert np.all(chunk_lengths <= maximum_length), (
         f"{maximum_length = }: {larger_chunks} are too large"
     )
+
+
+@pytest.mark.parametrize(
+    "input_text,maximum_length",
+    list(
+        product(
+            list(INPUT_TEXTS_LONGWORDS.values()),
+            [val for val in maximum_length_vals if val is not None],
+        )
+    ),
+)
+def test_paragraph_chunk_long_input(input_text, maximum_length):
+    with pytest.raises(ValueError):
+        list(chunk_by_sentence(input_text, maximum_length))
@@ -2,7 +2,7 @@ import numpy as np
 import pytest

 from cognee.tasks.chunks import chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS, INPUT_TEXTS_LONGWORDS


 @pytest.mark.parametrize(
@@ -11,7 +11,7 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_isomorphism(input_text):
@@ -28,7 +28,7 @@ def test_chunk_by_word_isomorphism(input_text):
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_splits(input_text):
@@ -179,7 +179,6 @@ def pad(
     mode: _ModeFunc,
     **kwargs: Any,
 ) -> NDArray[Any]: ...""",
-    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
     "english_text": """O for that warning voice, which he who saw
 Th' Apocalyps, heard cry in Heaven aloud,
 Then when the Dragon, put to second rout,
@@ -282,3 +281,7 @@ For never can true reconcilement grow
 Where wounds of deadly hate have peirc'd so deep:
 Which would but lead me to a worse relapse [ 100 ]""",
 }
+
+INPUT_TEXTS_LONGWORDS = {
+    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
+}
File diff suppressed because one or more lines are too long