feat: Eliminate the use of max_chunk_tokens and use a unified max_chunk_size instead [cog-1381] (#626)


## Description
Replaces the separate `chunk_size` (word budget) and `max_chunk_tokens` (token budget) parameters with a single `max_chunk_size` across chunkers, document readers, and the chunk-extraction task. Chunk growth is now measured in tokens when the embedding engine exposes a tokenizer, and in words otherwise.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **Refactor**
  - Unified the separate size-related parameters into a single `max_chunk_size` metric across chunking and extraction.
  - Streamlined text segmentation by removing the redundant word/token bookkeeping and checks, giving a more consistent chunk-management process.
- **Chores**
  - Removed the `modal` package as a dependency.
- **Documentation**
  - Updated the README.md to include a new demo video link and clarified default environment variable settings.
  - Improved the CONTRIBUTING.md for clarity and engagement with potential contributors.
- **Bug Fixes**
  - Extended sentence-ending punctuation handling to additional (CJK) characters.
- **Version Update**
  - Updated the project version to 0.1.33 in pyproject.toml.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
alekszievr 2025-03-12 14:03:41 +01:00 committed by GitHub
parent b78d9f196f
commit c1f7b667d1
27 changed files with 285 additions and 219 deletions
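Before the per-file diffs, the essence of the change in one sketch: a single `max_chunk_size` budget now governs chunk growth, counted in tokens when the embedding engine has a tokenizer and in words otherwise. The helper below is illustrative only (`word_size` is a hypothetical stand-in for the new `get_word_size` added to `chunk_by_sentence`):

```python
# Illustrative sketch of the unified size metric, not cognee's actual API.
from typing import Callable, Optional

def word_size(word: str, count_tokens: Optional[Callable[[str], int]] = None) -> int:
    # With a tokenizer, a word costs its token count; without one, it costs 1.
    return count_tokens(word) if count_tokens else 1

words = "one budget drives chunk growth now".split()
print(sum(word_size(w) for w in words))  # 6 under the word-count fallback
```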

```diff
@@ -113,7 +113,7 @@ def generate_dataset_name(dataset_name: str) -> str:
 async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
-    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunk_size=1024, chunker=TextChunker
+    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker
 ) -> list[Task]:
     if user is None:
         user = await get_default_user()
@@ -125,9 +125,8 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
         Task(check_permissions_on_documents, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents,
-            max_chunk_tokens=get_max_chunk_tokens(),
+            max_chunk_size=get_max_chunk_tokens(),
             chunker=chunker,
-            chunk_size=chunk_size,
         ),  # Extract text chunks based on the document type.
         Task(
             extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
```

```diff
@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
         await cognee.add(self.raw_corpus)
 
-        tasks = await self.task_getter(chunk_size=chunk_size, chunker=TextChunker)
+        tasks = await self.task_getter(chunker=TextChunker)
 
         await cognee.cognify(tasks=tasks)
```

```diff
@@ -48,7 +48,6 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
     )
     questions = await corpus_builder.build_corpus(
         limit=params.get("number_of_samples_in_corpus"),
-        chunk_size=chunk_size,
         chunker=chunker,
         load_golden_context=params.get("evaluating_contexts"),
     )
```

```diff
@@ -1,13 +1,12 @@
 class Chunker:
-    def __init__(self, document, get_text: callable, max_chunk_tokens: int, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, max_chunk_size: int):
         self.chunk_index = 0
         self.chunk_size = 0
         self.token_count = 0
         self.document = document
-        self.max_chunk_size = chunk_size
+        self.max_chunk_size = max_chunk_size
         self.get_text = get_text
-        self.max_chunk_tokens = max_chunk_tokens
 
     def read(self):
         raise NotImplementedError
```
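Under the new contract a chunker subclass receives only the document, a text getter, and one budget. A hedged sketch of what a custom subclass looks like, assuming a cognee installation with the `Chunker` shown above (`LineChunker` is hypothetical, not part of cognee):

```python
from cognee.modules.chunking.Chunker import Chunker

class LineChunker(Chunker):  # hypothetical subclass for illustration
    def read(self):
        for text in self.get_text():
            for line in text.splitlines():
                # self.max_chunk_size is now the only size budget to honor
                if line and len(line.split()) <= self.max_chunk_size:
                    yield line
```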

```diff
@@ -9,33 +9,23 @@ logger = logging.getLogger(__name__)
 class TextChunker(Chunker):
-    def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
-        word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
-        token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_chunk_tokens
-        return word_count_fits and token_count_fits
-
     def read(self):
         paragraph_chunks = []
         for content_text in self.get_text():
            for chunk_data in chunk_by_paragraph(
                 content_text,
-                self.max_chunk_tokens,
                 self.max_chunk_size,
                 batch_paragraphs=True,
             ):
-                if self.check_word_count_and_token_count(
-                    self.chunk_size, self.token_count, chunk_data
-                ):
+                if self.chunk_size + chunk_data["chunk_size"] <= self.max_chunk_size:
                     paragraph_chunks.append(chunk_data)
-                    self.chunk_size += chunk_data["word_count"]
-                    self.token_count += chunk_data["token_count"]
+                    self.chunk_size += chunk_data["chunk_size"]
                 else:
                     if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
                             id=chunk_data["chunk_id"],
                             text=chunk_data["text"],
-                            word_count=chunk_data["word_count"],
-                            token_count=chunk_data["token_count"],
+                            chunk_size=chunk_data["chunk_size"],
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=chunk_data["cut_type"],
@@ -54,8 +44,7 @@ class TextChunker(Chunker):
                                 NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"
                             ),
                             text=chunk_text,
-                            word_count=self.chunk_size,
-                            token_count=self.token_count,
+                            chunk_size=self.chunk_size,
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
@@ -68,8 +57,7 @@ class TextChunker(Chunker):
                         logger.error(e)
                         raise e
                     paragraph_chunks = [chunk_data]
-                    self.chunk_size = chunk_data["word_count"]
-                    self.token_count = chunk_data["token_count"]
+                    self.chunk_size = chunk_data["chunk_size"]
                 self.chunk_index += 1
@@ -78,8 +66,7 @@ class TextChunker(Chunker):
             yield DocumentChunk(
                 id=uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                 text=" ".join(chunk["text"] for chunk in paragraph_chunks),
-                word_count=self.chunk_size,
-                token_count=self.token_count,
+                chunk_size=self.chunk_size,
                 is_part_of=self.document,
                 chunk_index=self.chunk_index,
                 cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
```
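The behavioral core of the `TextChunker` change is that the old combined word-and-token check collapses into one comparison. A standalone restatement, with no cognee imports:

```python
# Restatement of the new accumulation rule in TextChunker.read:
# a paragraph batch keeps growing only while the summed chunk_size fits.
def fits(current_size: int, candidate_size: int, max_chunk_size: int) -> bool:
    return current_size + candidate_size <= max_chunk_size

assert fits(50, 14, 64)      # 64 still fits a 64-unit budget
assert not fits(51, 14, 64)  # 65 overflows, so a new chunk starts
```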

```diff
@@ -7,8 +7,7 @@ from cognee.modules.engine.models import Entity
 class DocumentChunk(DataPoint):
     text: str
-    word_count: int
-    token_count: int
+    chunk_size: int
     chunk_index: int
     cut_type: str
     is_part_of: Document
```

```diff
@@ -11,13 +11,11 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the audio file
         text = self.create_transcript()
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=lambda: [text])
 
         yield from chunker.read()
```

```diff
@@ -10,7 +10,5 @@ class Document(DataPoint):
     mime_type: str
     metadata: dict = {"index_fields": ["name"]}
 
-    def read(
-        self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: Optional[int] = None
-    ) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         pass
```

```diff
@@ -11,12 +11,10 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the image file
         text = self.transcribe_image()
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=lambda: [text], max_chunk_size=max_chunk_size)
 
         yield from chunker.read()
```

```diff
@@ -7,7 +7,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -15,9 +15,7 @@ class PdfDocument(Document):
                 page_text = page.extract_text()
                 yield page_text
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
 
         yield from chunker.read()
```

```diff
@@ -5,7 +5,7 @@ from cognee.modules.chunking.Chunker import Chunker
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
@@ -16,8 +16,6 @@ class TextDocument(Document):
 
                     yield text
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)
 
         yield from chunker.read()
```

```diff
@@ -9,7 +9,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -28,8 +28,6 @@ class UnstructuredDocument(Document):
 
             yield text
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
 
         yield from chunker.read()
```

```diff
@@ -1,15 +1,12 @@
 from typing import Any, Dict, Iterator
 from uuid import NAMESPACE_OID, uuid5
 
-from cognee.infrastructure.databases.vector import get_vector_engine
 from .chunk_by_sentence import chunk_by_sentence
 
 
 def chunk_by_paragraph(
     data: str,
-    max_chunk_tokens,
-    paragraph_length: int = 1024,
+    max_chunk_size,
     batch_paragraphs: bool = True,
 ) -> Iterator[Dict[str, Any]]:
     """
@@ -23,28 +20,19 @@ def chunk_by_paragraph(
     - Remaining text at the end of the input will be yielded as a final chunk.
     """
     current_chunk = ""
-    current_word_count = 0
     chunk_index = 0
     paragraph_ids = []
     last_cut_type = None
-    current_token_count = 0
+    current_chunk_size = 0
 
-    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(
-        data, maximum_length=paragraph_length
+    for paragraph_id, sentence, sentence_size, end_type in chunk_by_sentence(
+        data, maximum_size=max_chunk_size
     ):
         # Check if this sentence would exceed length limit
-        embedding_engine = get_vector_engine().embedding_engine
-        token_count = embedding_engine.tokenizer.count_tokens(sentence)
-
-        if current_word_count > 0 and (
-            current_word_count + word_count > paragraph_length
-            or current_token_count + token_count > max_chunk_tokens
-        ):
+        if current_chunk_size > 0 and (current_chunk_size + sentence_size > max_chunk_size):
             # Yield current chunk
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "paragraph_ids": paragraph_ids,
                 "chunk_index": chunk_index,
@@ -56,22 +44,19 @@ def chunk_by_paragraph(
             # Start new chunk with current sentence
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1
 
         paragraph_ids.append(paragraph_id)
         current_chunk += sentence
-        current_word_count += word_count
-        current_token_count += token_count
+        current_chunk_size += sentence_size
 
         # Handle end of paragraph
         if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
             # For non-batch mode, yield each paragraph separately
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "paragraph_ids": paragraph_ids,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "chunk_index": chunk_index,
@@ -80,8 +65,7 @@ def chunk_by_paragraph(
             yield chunk_dict
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1
 
         last_cut_type = end_type
@@ -90,8 +74,7 @@ def chunk_by_paragraph(
     if current_chunk:
         chunk_dict = {
             "text": current_chunk,
-            "word_count": current_word_count,
-            "token_count": current_token_count,
+            "chunk_size": current_chunk_size,
            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
            "paragraph_ids": paragraph_ids,
            "chunk_index": chunk_index,
```

```diff
@@ -1,10 +1,19 @@
 from uuid import uuid4, UUID
 from typing import Optional, Iterator, Tuple
 from .chunk_by_word import chunk_by_word
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+
+
+def get_word_size(word: str) -> int:
+    embedding_engine = get_embedding_engine()
+    if embedding_engine.tokenizer:
+        return embedding_engine.tokenizer.count_tokens(word)
+    else:
+        return 1
 
 
 def chunk_by_sentence(
-    data: str, maximum_length: Optional[int] = None
+    data: str, maximum_size: Optional[int] = None
 ) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
     """
     Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
@@ -16,7 +25,7 @@ def chunk_by_sentence(
     """
     sentence = ""
     paragraph_id = uuid4()
-    word_count = 0
+    sentence_size = 0
     section_end = False
     word_type_state = None
@@ -25,8 +34,7 @@ def chunk_by_sentence(
     # and words with the same characteristics connect it to a preceding
     # word with word_type 'paragraph_end' or 'sentence_end'
     for word, word_type in chunk_by_word(data):
-        sentence += word
-        word_count += 1
+        word_size = get_word_size(word)
 
         if word_type in ["paragraph_end", "sentence_end"]:
             word_type_state = word_type
@@ -36,19 +44,31 @@ def chunk_by_sentence(
                     word_type_state = word_type
                     break
 
-        if word_type in ["paragraph_end", "sentence_end"] or (
-            maximum_length and (word_count == maximum_length)
-        ):
-            yield (paragraph_id, sentence, word_count, word_type_state)
-            sentence = ""
-            word_count = 0
+        if maximum_size and (sentence_size + word_size > maximum_size):
+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = word
+            sentence_size = word_size
+        elif word_type in ["paragraph_end", "sentence_end"]:
+            sentence += word
+            sentence_size += word_size
             paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = ""
+            sentence_size = 0
+        else:
+            sentence += word
+            sentence_size += word_size
 
     if len(sentence) > 0:
+        if maximum_size and sentence_size > maximum_size:
+            raise ValueError(f"Input word {word} longer than chunking size {maximum_size}.")
         section_end = "sentence_cut" if word_type_state == "word" else word_type_state
         yield (
             paragraph_id,
             sentence,
-            word_count,
+            sentence_size,
             section_end,
         )
```
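The new `get_word_size` helper is what makes token counting optional. A self-contained restatement of its fallback branch (the engine class below is a stand-in, not cognee's):

```python
class _TokenizerlessEngine:  # stand-in mirroring the tests' MockEngine
    tokenizer = None

def get_word_size_like(engine, word: str) -> int:
    if engine.tokenizer:
        return engine.tokenizer.count_tokens(word)
    return 1  # without a tokenizer, every word costs one unit

assert get_word_size_like(_TokenizerlessEngine(), "hello") == 1
```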

```diff
@@ -2,7 +2,7 @@ import re
 from typing import Iterator, Tuple
 
-SENTENCE_ENDINGS = r"[.;!?…]"
+SENTENCE_ENDINGS = r"[.;!?…。!?]"
 PARAGRAPH_ENDINGS = r"[\n\r]"
```
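The widened character class treats CJK sentence punctuation as sentence endings too; a quick self-contained check:

```python
import re

SENTENCE_ENDINGS = r"[.;!?…。!?]"  # as in the diff above

for mark in (".", "…", "。", "!", "?"):
    assert re.match(SENTENCE_ENDINGS, mark)
assert not re.match(SENTENCE_ENDINGS, ",")
```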

```diff
@@ -26,8 +26,7 @@ async def update_document_token_count(document_id: UUID, token_count: int) -> None:
 async def extract_chunks_from_documents(
     documents: list[Document],
-    max_chunk_tokens: int,
-    chunk_size: int = 1024,
+    max_chunk_size: int,
     chunker: Chunker = TextChunker,
 ) -> AsyncGenerator:
     """
@@ -39,10 +38,9 @@ async def extract_chunks_from_documents(
     """
     for document in documents:
         document_token_count = 0
-        for document_chunk in document.read(
-            chunk_size=chunk_size, chunker_cls=chunker, max_chunk_tokens=max_chunk_tokens
-        ):
-            document_token_count += document_chunk.token_count
+        for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
+            document_token_count += document_chunk.chunk_size
             yield document_chunk
 
         await update_document_token_count(document.id, document_token_count)
 # todo rita
```

```diff
@@ -2,6 +2,17 @@ import uuid
 from unittest.mock import patch
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+
 
 GROUND_TRUTH = [
     {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
@@ -24,7 +35,10 @@ TEST_TEXT = """
 "The feature ships, Sarah. That's final.\""""
 
-def test_AudioDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_AudioDocument(mock_engine):
     document = AudioDocument(
         id=uuid.uuid4(),
         name="audio-dummy-test",
@@ -35,10 +49,10 @@ def test_AudioDocument():
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```
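The `mock_get_embedding_engine` helper introduced here is what keeps the word-based ground truths valid: patched in for the real engine, its `tokenizer = None` forces the word-count fallback. The remaining document tests import it instead of redefining it:

```python
from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
```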

```diff
@@ -2,6 +2,11 @@ import uuid
 from unittest.mock import patch
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
 
 GROUND_TRUTH = [
     {"word_count": 51, "len_text": 298, "cut_type": "sentence_end"},
@@ -13,7 +18,10 @@ TEST_TEXT = """A dramatic confrontation unfolds as a red fox and river otter eng
 The commotion has attracted an audience: a murder of crows has gathered in the low branches, their harsh calls adding to the chaos as they hop excitedly from limb to limb. One particularly bold crow dive-bombs the wrestling pair, causing both animals to momentarily freeze mid-tussle, creating a perfect snapshot of suspended actionthe fox's fur dripping wet, the otter's body coiled like a spring, and the crow's wings spread wide against the golden morning light."""
 
-def test_ImageDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_ImageDocument(mock_engine):
     document = ImageDocument(
         id=uuid.uuid4(),
         name="image-dummy-test",
@@ -24,10 +32,10 @@ def test_ImageDocument():
     with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```

```diff
@@ -2,6 +2,12 @@ import os
 import uuid
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+from unittest.mock import patch
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
 
 GROUND_TRUTH = [
     {"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
@@ -9,7 +15,10 @@ GROUND_TRUTH = [
 ]
 
-def test_PdfDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_PdfDocument(mock_engine):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -25,10 +34,10 @@ def test_PdfDocument():
     )
 
     for ground_truth, paragraph_data in zip(
-        GROUND_TRUTH, document.read(chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=2048)
+        GROUND_TRUTH, document.read(chunker_cls=TextChunker, max_chunk_size=1024)
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```

```diff
@@ -4,6 +4,12 @@ import uuid
 
 import pytest
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.TextDocument import TextDocument
+from unittest.mock import patch
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
 
 GROUND_TRUTH = {
     "code.txt": [
@@ -21,7 +27,10 @@ GROUND_TRUTH = {
     "input_file,chunk_size",
     [("code.txt", 256), ("Natural_language_processing.txt", 128)],
 )
-def test_TextDocument(input_file, chunk_size):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_TextDocument(mock_engine, input_file, chunk_size):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -38,10 +47,10 @@ def test_TextDocument(input_file, chunk_size):
 
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH[input_file],
-        document.read(chunk_size=chunk_size, chunker_cls=TextChunker, max_chunk_tokens=1024),
+        document.read(chunker_cls=TextChunker, max_chunk_size=chunk_size),
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```

```diff
@@ -1,10 +1,18 @@
 import os
 import uuid
+from unittest.mock import patch
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
 
 
-def test_UnstructuredDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_UnstructuredDocument(mock_engine):
     # Define file paths of test data
     pptx_file_path = os.path.join(
         os.sep,
@@ -68,30 +76,24 @@ def test_UnstructuredDocument():
     )
 
     # Test PPTX
-    for paragraph_data in pptx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
+    for paragraph_data in pptx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 19 == paragraph_data.chunk_size, f" 19 != {paragraph_data.chunk_size = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"
         )
 
     # Test DOCX
-    for paragraph_data in docx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
+    for paragraph_data in docx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 16 == paragraph_data.chunk_size, f" 16 != {paragraph_data.chunk_size = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
         assert "sentence_end" == paragraph_data.cut_type, (
             f" sentence_end != {paragraph_data.cut_type = }"
         )
 
     # TEST CSV
-    for paragraph_data in csv_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
+    for paragraph_data in csv_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 15 == paragraph_data.chunk_size, f" 15 != {paragraph_data.chunk_size = }"
         assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
             f"Read text doesn't match expected text: {paragraph_data.text}"
         )
@@ -100,10 +102,8 @@ def test_UnstructuredDocument():
     )
 
     # Test XLSX
-    for paragraph_data in xlsx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
+    for paragraph_data in xlsx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 36 == paragraph_data.chunk_size, f" 36 != {paragraph_data.chunk_size = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"
```

```diff
@@ -3,29 +3,26 @@ from itertools import product
 import numpy as np
 import pytest
 
-from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+from cognee.tasks.chunks import chunk_by_paragraph
 
-paragraph_lengths = [64, 256, 1024]
 batch_paragraphs_vals = [True, False]
-max_chunk_tokens_vals = [512, 1024, 4096]
+max_chunk_size_vals = [512, 1024, 4096]
 
 
 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_chunk_by_paragraph_isomorphism(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
-    chunks = chunk_by_paragraph(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs)
+def test_chunk_by_paragraph_isomorphism(input_text, max_chunk_size, batch_paragraphs):
+    chunks = chunk_by_paragraph(input_text, max_chunk_size, batch_paragraphs)
     reconstructed_text = "".join([chunk["text"] for chunk in chunks])
     assert reconstructed_text == input_text, (
         f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
@@ -33,52 +30,49 @@ def test_chunk_by_paragraph_isomorphism(
 
 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size, batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_paragraph_chunk_length(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs):
+def test_paragraph_chunk_length(input_text, max_chunk_size, batch_paragraphs):
     chunks = list(
         chunk_by_paragraph(
             data=input_text,
-            max_chunk_tokens=max_chunk_tokens,
-            paragraph_length=paragraph_length,
+            max_chunk_size=max_chunk_size,
             batch_paragraphs=batch_paragraphs,
         )
     )
 
-    chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])
+    embedding_engine = get_embedding_engine()
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk["text"]) for chunk in chunks]
+    )
 
-    larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
-    assert np.all(chunk_lengths <= paragraph_length), (
-        f"{paragraph_length = }: {larger_chunks} are too large"
+    larger_chunks = chunk_lengths[chunk_lengths > max_chunk_size]
+    assert np.all(chunk_lengths <= max_chunk_size), (
+        f"{max_chunk_size = }: {larger_chunks} are too large"
     )
 
 
 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
            batch_paragraphs_vals,
        )
    ),
 )
-def test_chunk_by_paragraph_chunk_numbering(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
+def test_chunk_by_paragraph_chunk_numbering(input_text, max_chunk_size, batch_paragraphs):
     chunks = chunk_by_paragraph(
         data=input_text,
-        max_chunk_tokens=max_chunk_tokens,
-        paragraph_length=paragraph_length,
+        max_chunk_size=max_chunk_size,
         batch_paragraphs=batch_paragraphs,
     )
     chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
```

```diff
@@ -1,37 +1,49 @@
+from unittest.mock import patch
 from cognee.tasks.chunks import chunk_by_paragraph
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+
 
 GROUND_TRUTH = {
     "whole_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThird paragraph is a bit longer and is finished with a dot.",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_end",
         },
     ],
     "cut_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
        {
             "text": "\nThird paragraph is cut and is missing the dot at the end",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_cut",
         },
     ],
@@ -47,17 +59,18 @@ Third paragraph is cut and is missing the dot at the end""",
 }
 
-def run_chunking_test(test_text, expected_chunks):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def run_chunking_test(test_text, expected_chunks, mock_engine):
     chunks = []
-    for chunk_data in chunk_by_paragraph(
-        data=test_text, paragraph_length=12, batch_paragraphs=False, max_chunk_tokens=512
-    ):
+    for chunk_data in chunk_by_paragraph(data=test_text, batch_paragraphs=False, max_chunk_size=12):
         chunks.append(chunk_data)
 
     assert len(chunks) == 3
 
     for expected_chunks_item, chunk in zip(expected_chunks, chunks):
-        for key in ["text", "word_count", "cut_type"]:
+        for key in ["text", "chunk_size", "cut_type"]:
             assert chunk[key] == expected_chunks_item[key], (
                 f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
             )
```

```diff
@@ -3,10 +3,11 @@ from itertools import product
 import numpy as np
 import pytest
 
-from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tasks.chunks import chunk_by_sentence
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS_LONGWORDS, INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
 
-maximum_length_vals = [None, 8, 64]
+maximum_length_vals = [None, 16, 64]
 
 
 @pytest.mark.parametrize(
@@ -33,9 +34,26 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
 def test_paragraph_chunk_length(input_text, maximum_length):
     chunks = list(chunk_by_sentence(input_text, maximum_length))
 
-    chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
+    embedding_engine = get_embedding_engine()
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk[1]) for chunk in chunks]
+    )
 
     larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
     assert np.all(chunk_lengths <= maximum_length), (
         f"{maximum_length = }: {larger_chunks} are too large"
     )
+
+
+@pytest.mark.parametrize(
+    "input_text,maximum_length",
+    list(
+        product(
+            list(INPUT_TEXTS_LONGWORDS.values()),
+            [val for val in maximum_length_vals if val is not None],
+        )
+    ),
+)
+def test_paragraph_chunk_long_input(input_text, maximum_length):
+    with pytest.raises(ValueError):
+        list(chunk_by_sentence(input_text, maximum_length))
```

```diff
@@ -2,7 +2,7 @@ import numpy as np
 import pytest
 
 from cognee.tasks.chunks import chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS, INPUT_TEXTS_LONGWORDS
 
 
 @pytest.mark.parametrize(
@@ -11,7 +11,7 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_isomorphism(input_text):
@@ -28,7 +28,7 @@ def test_chunk_by_word_isomorphism(input_text):
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_splits(input_text):
```

```diff
@@ -179,7 +179,6 @@ def pad(
     mode: _ModeFunc,
     **kwargs: Any,
 ) -> NDArray[Any]: ...""",
-    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
     "english_text": """O for that warning voice, which he who saw
 Th' Apocalyps, heard cry in Heaven aloud,
 Then when the Dragon, put to second rout,
@@ -282,3 +281,7 @@ For never can true reconcilement grow
 Where wounds of deadly hate have peirc'd so deep:
 Which would but lead me to a worse relapse [ 100 ]""",
 }
+
+INPUT_TEXTS_LONGWORDS = {
+    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
+}
```

File diff suppressed because one or more lines are too long