feat: Eliminate the use of max_chunk_tokens and use a unified max_chunk_size instead [cog-1381] (#626)


## Description
Replaces the separate `chunk_size` (word budget) and `max_chunk_tokens` (token budget) parameters with a single `max_chunk_size` across chunkers, document readers, and the chunk-extraction task. Chunk growth is now measured in tokens when the embedding engine exposes a tokenizer, and in words otherwise.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **Refactor**
  - Unified the separate size-related parameters into a single `max_chunk_size` metric across chunking and extraction.
  - Streamlined text segmentation by removing the redundant word/token bookkeeping and checks, giving a more consistent chunk-management process.
- **Chores**
  - Removed the `modal` package as a dependency.
- **Documentation**
  - Updated the README.md to include a new demo video link and clarified default environment variable settings.
  - Improved the CONTRIBUTING.md for clarity and engagement with potential contributors.
- **Bug Fixes**
  - Extended sentence-ending punctuation handling to additional (CJK) characters.
- **Version Update**
  - Updated the project version to 0.1.33 in pyproject.toml.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
alekszievr 2025-03-12 14:03:41 +01:00 committed by GitHub
parent b78d9f196f
commit c1f7b667d1
27 changed files with 285 additions and 219 deletions
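Before the per-file diffs, the essence of the change in one sketch: a single `max_chunk_size` budget now governs chunk growth, counted in tokens when the embedding engine has a tokenizer and in words otherwise. The helper below is illustrative only (`word_size` is a hypothetical stand-in for the new `get_word_size` added to `chunk_by_sentence`):

```python
# Illustrative sketch of the unified size metric, not cognee's actual API.
from typing import Callable, Optional

def word_size(word: str, count_tokens: Optional[Callable[[str], int]] = None) -> int:
    # With a tokenizer, a word costs its token count; without one, it costs 1.
    return count_tokens(word) if count_tokens else 1

words = "one budget drives chunk growth now".split()
print(sum(word_size(w) for w in words))  # 6 under the word-count fallback
```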

```diff
@@ -113,7 +113,7 @@ def generate_dataset_name(dataset_name: str) -> str:
 async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
-    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunk_size=1024, chunker=TextChunker
+    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker
 ) -> list[Task]:
     if user is None:
         user = await get_default_user()
@@ -125,9 +125,8 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
         Task(check_permissions_on_documents, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents,
-            max_chunk_tokens=get_max_chunk_tokens(),
+            max_chunk_size=get_max_chunk_tokens(),
             chunker=chunker,
-            chunk_size=chunk_size,
         ),  # Extract text chunks based on the document type.
         Task(
             extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
```

```diff
@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
         await cognee.add(self.raw_corpus)
 
-        tasks = await self.task_getter(chunk_size=chunk_size, chunker=TextChunker)
+        tasks = await self.task_getter(chunker=TextChunker)
 
         await cognee.cognify(tasks=tasks)
```

```diff
@@ -48,7 +48,6 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
     )
     questions = await corpus_builder.build_corpus(
         limit=params.get("number_of_samples_in_corpus"),
-        chunk_size=chunk_size,
         chunker=chunker,
         load_golden_context=params.get("evaluating_contexts"),
     )
```

```diff
@@ -1,13 +1,12 @@
 class Chunker:
-    def __init__(self, document, get_text: callable, max_chunk_tokens: int, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, max_chunk_size: int):
         self.chunk_index = 0
         self.chunk_size = 0
         self.token_count = 0
         self.document = document
-        self.max_chunk_size = chunk_size
+        self.max_chunk_size = max_chunk_size
         self.get_text = get_text
-        self.max_chunk_tokens = max_chunk_tokens
 
     def read(self):
         raise NotImplementedError
```
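Under the new contract a chunker subclass receives only the document, a text getter, and one budget. A hedged sketch of what a custom subclass looks like, assuming a cognee installation with the `Chunker` shown above (`LineChunker` is hypothetical, not part of cognee):

```python
from cognee.modules.chunking.Chunker import Chunker

class LineChunker(Chunker):  # hypothetical subclass for illustration
    def read(self):
        for text in self.get_text():
            for line in text.splitlines():
                # self.max_chunk_size is now the only size budget to honor
                if line and len(line.split()) <= self.max_chunk_size:
                    yield line
```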

```diff
@@ -9,33 +9,23 @@ logger = logging.getLogger(__name__)
 class TextChunker(Chunker):
-    def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
-        word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
-        token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_chunk_tokens
-        return word_count_fits and token_count_fits
-
     def read(self):
         paragraph_chunks = []
         for content_text in self.get_text():
            for chunk_data in chunk_by_paragraph(
                 content_text,
-                self.max_chunk_tokens,
                 self.max_chunk_size,
                 batch_paragraphs=True,
             ):
-                if self.check_word_count_and_token_count(
-                    self.chunk_size, self.token_count, chunk_data
-                ):
+                if self.chunk_size + chunk_data["chunk_size"] <= self.max_chunk_size:
                     paragraph_chunks.append(chunk_data)
-                    self.chunk_size += chunk_data["word_count"]
-                    self.token_count += chunk_data["token_count"]
+                    self.chunk_size += chunk_data["chunk_size"]
                 else:
                     if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
                             id=chunk_data["chunk_id"],
                             text=chunk_data["text"],
-                            word_count=chunk_data["word_count"],
-                            token_count=chunk_data["token_count"],
+                            chunk_size=chunk_data["chunk_size"],
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=chunk_data["cut_type"],
@@ -54,8 +44,7 @@ class TextChunker(Chunker):
                                 NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"
                             ),
                             text=chunk_text,
-                            word_count=self.chunk_size,
-                            token_count=self.token_count,
+                            chunk_size=self.chunk_size,
                             is_part_of=self.document,
                             chunk_index=self.chunk_index,
                             cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
@@ -68,8 +57,7 @@ class TextChunker(Chunker):
                         logger.error(e)
                         raise e
                     paragraph_chunks = [chunk_data]
-                    self.chunk_size = chunk_data["word_count"]
-                    self.token_count = chunk_data["token_count"]
+                    self.chunk_size = chunk_data["chunk_size"]
                 self.chunk_index += 1
@@ -78,8 +66,7 @@ class TextChunker(Chunker):
             yield DocumentChunk(
                 id=uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                 text=" ".join(chunk["text"] for chunk in paragraph_chunks),
-                word_count=self.chunk_size,
-                token_count=self.token_count,
+                chunk_size=self.chunk_size,
                 is_part_of=self.document,
                 chunk_index=self.chunk_index,
                 cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
```
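The behavioral core of the `TextChunker` change is that the old combined word-and-token check collapses into one comparison. A standalone restatement, with no cognee imports:

```python
# Restatement of the new accumulation rule in TextChunker.read:
# a paragraph batch keeps growing only while the summed chunk_size fits.
def fits(current_size: int, candidate_size: int, max_chunk_size: int) -> bool:
    return current_size + candidate_size <= max_chunk_size

assert fits(50, 14, 64)      # 64 still fits a 64-unit budget
assert not fits(51, 14, 64)  # 65 overflows, so a new chunk starts
```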

```diff
@@ -7,8 +7,7 @@ from cognee.modules.engine.models import Entity
 class DocumentChunk(DataPoint):
     text: str
-    word_count: int
-    token_count: int
+    chunk_size: int
     chunk_index: int
     cut_type: str
     is_part_of: Document
```

```diff
@@ -11,13 +11,11 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the audio file
         text = self.create_transcript()
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=lambda: [text])
 
         yield from chunker.read()
```

```diff
@@ -10,7 +10,5 @@ class Document(DataPoint):
     mime_type: str
     metadata: dict = {"index_fields": ["name"]}
 
-    def read(
-        self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: Optional[int] = None
-    ) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         pass
```

```diff
@@ -11,12 +11,10 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         # Transcribe the image file
         text = self.transcribe_image()
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=lambda: [text], max_chunk_size=max_chunk_size)
 
         yield from chunker.read()
```

```diff
@@ -7,7 +7,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -15,9 +15,7 @@ class PdfDocument(Document):
                 page_text = page.extract_text()
                 yield page_text
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
 
         yield from chunker.read()
```

```diff
@@ -5,7 +5,7 @@ from cognee.modules.chunking.Chunker import Chunker
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int):
+    def read(self, chunker_cls: Chunker, max_chunk_size: int):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
@@ -16,8 +16,6 @@ class TextDocument(Document):
 
                     yield text
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)
 
         yield from chunker.read()
```

```diff
@@ -9,7 +9,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker_cls: Chunker, max_chunk_tokens: int) -> str:
+    def read(self, chunker_cls: Chunker, max_chunk_size: int) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -28,8 +28,6 @@ class UnstructuredDocument(Document):
 
             yield text
 
-        chunker = chunker_cls(
-            self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
-        )
+        chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
 
         yield from chunker.read()
```

```diff
@@ -1,15 +1,12 @@
 from typing import Any, Dict, Iterator
 from uuid import NAMESPACE_OID, uuid5
 
-from cognee.infrastructure.databases.vector import get_vector_engine
 from .chunk_by_sentence import chunk_by_sentence
 
 
 def chunk_by_paragraph(
     data: str,
-    max_chunk_tokens,
-    paragraph_length: int = 1024,
+    max_chunk_size,
     batch_paragraphs: bool = True,
 ) -> Iterator[Dict[str, Any]]:
     """
@@ -23,28 +20,19 @@ def chunk_by_paragraph(
     - Remaining text at the end of the input will be yielded as a final chunk.
     """
     current_chunk = ""
-    current_word_count = 0
     chunk_index = 0
     paragraph_ids = []
     last_cut_type = None
-    current_token_count = 0
+    current_chunk_size = 0
 
-    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(
-        data, maximum_length=paragraph_length
+    for paragraph_id, sentence, sentence_size, end_type in chunk_by_sentence(
+        data, maximum_size=max_chunk_size
     ):
         # Check if this sentence would exceed length limit
-        embedding_engine = get_vector_engine().embedding_engine
-        token_count = embedding_engine.tokenizer.count_tokens(sentence)
-
-        if current_word_count > 0 and (
-            current_word_count + word_count > paragraph_length
-            or current_token_count + token_count > max_chunk_tokens
-        ):
+        if current_chunk_size > 0 and (current_chunk_size + sentence_size > max_chunk_size):
             # Yield current chunk
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "paragraph_ids": paragraph_ids,
                 "chunk_index": chunk_index,
@@ -56,22 +44,19 @@ def chunk_by_paragraph(
             # Start new chunk with current sentence
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1
 
         paragraph_ids.append(paragraph_id)
         current_chunk += sentence
-        current_word_count += word_count
-        current_token_count += token_count
+        current_chunk_size += sentence_size
 
         # Handle end of paragraph
         if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
             # For non-batch mode, yield each paragraph separately
             chunk_dict = {
                 "text": current_chunk,
-                "word_count": current_word_count,
-                "token_count": current_token_count,
+                "chunk_size": current_chunk_size,
                 "paragraph_ids": paragraph_ids,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "chunk_index": chunk_index,
@@ -80,8 +65,7 @@ def chunk_by_paragraph(
             yield chunk_dict
             paragraph_ids = []
             current_chunk = ""
-            current_word_count = 0
-            current_token_count = 0
+            current_chunk_size = 0
             chunk_index += 1
 
         last_cut_type = end_type
@@ -90,8 +74,7 @@ def chunk_by_paragraph(
     if current_chunk:
         chunk_dict = {
             "text": current_chunk,
-            "word_count": current_word_count,
-            "token_count": current_token_count,
+            "chunk_size": current_chunk_size,
            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
            "paragraph_ids": paragraph_ids,
            "chunk_index": chunk_index,
```

```diff
@@ -1,10 +1,19 @@
 from uuid import uuid4, UUID
 from typing import Optional, Iterator, Tuple
 from .chunk_by_word import chunk_by_word
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+
+
+def get_word_size(word: str) -> int:
+    embedding_engine = get_embedding_engine()
+    if embedding_engine.tokenizer:
+        return embedding_engine.tokenizer.count_tokens(word)
+    else:
+        return 1
 
 
 def chunk_by_sentence(
-    data: str, maximum_length: Optional[int] = None
+    data: str, maximum_size: Optional[int] = None
 ) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
     """
     Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
@@ -16,7 +25,7 @@ def chunk_by_sentence(
     """
     sentence = ""
     paragraph_id = uuid4()
-    word_count = 0
+    sentence_size = 0
     section_end = False
     word_type_state = None
@@ -25,8 +34,7 @@ def chunk_by_sentence(
     # and words with the same characteristics connect it to a preceding
     # word with word_type 'paragraph_end' or 'sentence_end'
     for word, word_type in chunk_by_word(data):
-        sentence += word
-        word_count += 1
+        word_size = get_word_size(word)
 
         if word_type in ["paragraph_end", "sentence_end"]:
             word_type_state = word_type
@@ -36,19 +44,31 @@ def chunk_by_sentence(
                     word_type_state = word_type
                     break
 
-        if word_type in ["paragraph_end", "sentence_end"] or (
-            maximum_length and (word_count == maximum_length)
-        ):
-            yield (paragraph_id, sentence, word_count, word_type_state)
-            sentence = ""
-            word_count = 0
+        if maximum_size and (sentence_size + word_size > maximum_size):
+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = word
+            sentence_size = word_size
+        elif word_type in ["paragraph_end", "sentence_end"]:
+            sentence += word
+            sentence_size += word_size
             paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
+            yield (paragraph_id, sentence, sentence_size, word_type_state)
+            sentence = ""
+            sentence_size = 0
+        else:
+            sentence += word
+            sentence_size += word_size
 
     if len(sentence) > 0:
+        if maximum_size and sentence_size > maximum_size:
+            raise ValueError(f"Input word {word} longer than chunking size {maximum_size}.")
         section_end = "sentence_cut" if word_type_state == "word" else word_type_state
         yield (
             paragraph_id,
             sentence,
-            word_count,
+            sentence_size,
             section_end,
         )
```
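The new `get_word_size` helper is what makes token counting optional. A self-contained restatement of its fallback branch (the engine class below is a stand-in, not cognee's):

```python
class _TokenizerlessEngine:  # stand-in mirroring the tests' MockEngine
    tokenizer = None

def get_word_size_like(engine, word: str) -> int:
    if engine.tokenizer:
        return engine.tokenizer.count_tokens(word)
    return 1  # without a tokenizer, every word costs one unit

assert get_word_size_like(_TokenizerlessEngine(), "hello") == 1
```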

```diff
@@ -2,7 +2,7 @@ import re
 from typing import Iterator, Tuple
 
-SENTENCE_ENDINGS = r"[.;!?…]"
+SENTENCE_ENDINGS = r"[.;!?…。!?]"
 PARAGRAPH_ENDINGS = r"[\n\r]"
```
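The widened character class treats CJK sentence punctuation as sentence endings too; a quick self-contained check:

```python
import re

SENTENCE_ENDINGS = r"[.;!?…。!?]"  # as in the diff above

for mark in (".", "…", "。", "!", "?"):
    assert re.match(SENTENCE_ENDINGS, mark)
assert not re.match(SENTENCE_ENDINGS, ",")
```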

```diff
@@ -26,8 +26,7 @@ async def update_document_token_count(document_id: UUID, token_count: int) -> None:
 async def extract_chunks_from_documents(
     documents: list[Document],
-    max_chunk_tokens: int,
-    chunk_size: int = 1024,
+    max_chunk_size: int,
     chunker: Chunker = TextChunker,
 ) -> AsyncGenerator:
     """
@@ -39,10 +38,9 @@ async def extract_chunks_from_documents(
     """
     for document in documents:
         document_token_count = 0
-        for document_chunk in document.read(
-            chunk_size=chunk_size, chunker_cls=chunker, max_chunk_tokens=max_chunk_tokens
-        ):
-            document_token_count += document_chunk.token_count
+        for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
+            document_token_count += document_chunk.chunk_size
             yield document_chunk
 
         await update_document_token_count(document.id, document_token_count)
 # todo rita
```

```diff
@@ -2,6 +2,17 @@ import uuid
 from unittest.mock import patch
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+
 
 GROUND_TRUTH = [
     {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
@@ -24,7 +35,10 @@ TEST_TEXT = """
 "The feature ships, Sarah. That's final.\""""
 
-def test_AudioDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_AudioDocument(mock_engine):
     document = AudioDocument(
         id=uuid.uuid4(),
         name="audio-dummy-test",
@@ -35,10 +49,10 @@ def test_AudioDocument():
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```
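The `mock_get_embedding_engine` helper introduced here is what keeps the word-based ground truths valid: patched in for the real engine, its `tokenizer = None` forces the word-count fallback. The remaining document tests import it instead of redefining it:

```python
from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
```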

```diff
@@ -2,6 +2,11 @@ import uuid
 from unittest.mock import patch
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
 
 GROUND_TRUTH = [
     {"word_count": 51, "len_text": 298, "cut_type": "sentence_end"},
@@ -13,7 +18,10 @@ TEST_TEXT = """A dramatic confrontation unfolds as a red fox and river otter eng
 The commotion has attracted an audience: a murder of crows has gathered in the low branches, their harsh calls adding to the chaos as they hop excitedly from limb to limb. One particularly bold crow dive-bombs the wrestling pair, causing both animals to momentarily freeze mid-tussle, creating a perfect snapshot of suspended actionthe fox's fur dripping wet, the otter's body coiled like a spring, and the crow's wings spread wide against the golden morning light."""
 
-def test_ImageDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_ImageDocument(mock_engine):
     document = ImageDocument(
         id=uuid.uuid4(),
         name="image-dummy-test",
@@ -24,10 +32,10 @@ def test_ImageDocument():
     with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
             GROUND_TRUTH,
-            document.read(chunk_size=64, chunker_cls=TextChunker, max_chunk_tokens=512),
+            document.read(chunker_cls=TextChunker, max_chunk_size=64),
         ):
-            assert ground_truth["word_count"] == paragraph_data.word_count, (
-                f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+            assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+                f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
             )
             assert ground_truth["len_text"] == len(paragraph_data.text), (
                 f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```

```diff
@@ -2,6 +2,12 @@ import os
 import uuid
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+from unittest.mock import patch
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
 
 GROUND_TRUTH = [
     {"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
@@ -9,7 +15,10 @@ GROUND_TRUTH = [
 ]
 
-def test_PdfDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_PdfDocument(mock_engine):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -25,10 +34,10 @@ def test_PdfDocument():
     )
 
     for ground_truth, paragraph_data in zip(
-        GROUND_TRUTH, document.read(chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=2048)
+        GROUND_TRUTH, document.read(chunker_cls=TextChunker, max_chunk_size=1024)
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```

```diff
@@ -4,6 +4,12 @@ import uuid
 
 import pytest
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.TextDocument import TextDocument
+from unittest.mock import patch
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
 
 GROUND_TRUTH = {
     "code.txt": [
@@ -21,7 +27,10 @@ GROUND_TRUTH = {
     "input_file,chunk_size",
     [("code.txt", 256), ("Natural_language_processing.txt", 128)],
 )
-def test_TextDocument(input_file, chunk_size):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_TextDocument(mock_engine, input_file, chunk_size):
     test_file_path = os.path.join(
         os.sep,
         *(os.path.dirname(__file__).split(os.sep)[:-2]),
@@ -38,10 +47,10 @@ def test_TextDocument(input_file, chunk_size):
 
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH[input_file],
-        document.read(chunk_size=chunk_size, chunker_cls=TextChunker, max_chunk_tokens=1024),
+        document.read(chunker_cls=TextChunker, max_chunk_size=chunk_size),
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
+            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
         )
         assert ground_truth["len_text"] == len(paragraph_data.text), (
             f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
```

```diff
@@ -1,10 +1,18 @@
 import os
 import uuid
+from unittest.mock import patch
 
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
 
 
-def test_UnstructuredDocument():
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def test_UnstructuredDocument(mock_engine):
     # Define file paths of test data
     pptx_file_path = os.path.join(
         os.sep,
@@ -68,30 +76,24 @@ def test_UnstructuredDocument():
     )
 
     # Test PPTX
-    for paragraph_data in pptx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
+    for paragraph_data in pptx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 19 == paragraph_data.chunk_size, f" 19 != {paragraph_data.chunk_size = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"
         )
 
     # Test DOCX
-    for paragraph_data in docx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
+    for paragraph_data in docx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 16 == paragraph_data.chunk_size, f" 16 != {paragraph_data.chunk_size = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
         assert "sentence_end" == paragraph_data.cut_type, (
             f" sentence_end != {paragraph_data.cut_type = }"
         )
 
     # TEST CSV
-    for paragraph_data in csv_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
+    for paragraph_data in csv_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 15 == paragraph_data.chunk_size, f" 15 != {paragraph_data.chunk_size = }"
         assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
             f"Read text doesn't match expected text: {paragraph_data.text}"
         )
@@ -100,10 +102,8 @@ def test_UnstructuredDocument():
     )
 
     # Test XLSX
-    for paragraph_data in xlsx_document.read(
-        chunk_size=1024, chunker_cls=TextChunker, max_chunk_tokens=1024
-    ):
-        assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
+    for paragraph_data in xlsx_document.read(chunker_cls=TextChunker, max_chunk_size=1024):
+        assert 36 == paragraph_data.chunk_size, f" 36 != {paragraph_data.chunk_size = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
         assert "sentence_cut" == paragraph_data.cut_type, (
             f" sentence_cut != {paragraph_data.cut_type = }"
```

```diff
@@ -3,29 +3,26 @@ from itertools import product
 import numpy as np
 import pytest
 
-from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word
 from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
+from cognee.tasks.chunks import chunk_by_paragraph
 
-paragraph_lengths = [64, 256, 1024]
 batch_paragraphs_vals = [True, False]
-max_chunk_tokens_vals = [512, 1024, 4096]
+max_chunk_size_vals = [512, 1024, 4096]
 
 
 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_chunk_by_paragraph_isomorphism(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
-    chunks = chunk_by_paragraph(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs)
+def test_chunk_by_paragraph_isomorphism(input_text, max_chunk_size, batch_paragraphs):
+    chunks = chunk_by_paragraph(input_text, max_chunk_size, batch_paragraphs)
     reconstructed_text = "".join([chunk["text"] for chunk in chunks])
     assert reconstructed_text == input_text, (
         f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
@@ -33,52 +30,49 @@ def test_chunk_by_paragraph_isomorphism(
 
 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size, batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
             batch_paragraphs_vals,
         )
     ),
 )
-def test_paragraph_chunk_length(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs):
+def test_paragraph_chunk_length(input_text, max_chunk_size, batch_paragraphs):
     chunks = list(
         chunk_by_paragraph(
             data=input_text,
-            max_chunk_tokens=max_chunk_tokens,
-            paragraph_length=paragraph_length,
+            max_chunk_size=max_chunk_size,
             batch_paragraphs=batch_paragraphs,
         )
     )
 
-    chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])
+    embedding_engine = get_embedding_engine()
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk["text"]) for chunk in chunks]
+    )
 
-    larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
-    assert np.all(chunk_lengths <= paragraph_length), (
-        f"{paragraph_length = }: {larger_chunks} are too large"
+    larger_chunks = chunk_lengths[chunk_lengths > max_chunk_size]
+    assert np.all(chunk_lengths <= max_chunk_size), (
+        f"{max_chunk_size = }: {larger_chunks} are too large"
     )
 
 
 @pytest.mark.parametrize(
-    "input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
+    "input_text,max_chunk_size,batch_paragraphs",
     list(
         product(
             list(INPUT_TEXTS.values()),
-            max_chunk_tokens_vals,
-            paragraph_lengths,
+            max_chunk_size_vals,
            batch_paragraphs_vals,
        )
    ),
 )
-def test_chunk_by_paragraph_chunk_numbering(
-    input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
-):
+def test_chunk_by_paragraph_chunk_numbering(input_text, max_chunk_size, batch_paragraphs):
     chunks = chunk_by_paragraph(
         data=input_text,
-        max_chunk_tokens=max_chunk_tokens,
-        paragraph_length=paragraph_length,
+        max_chunk_size=max_chunk_size,
         batch_paragraphs=batch_paragraphs,
     )
     chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
```

```diff
@@ -1,37 +1,49 @@
+from unittest.mock import patch
 from cognee.tasks.chunks import chunk_by_paragraph
+import sys
+
+chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+
+def mock_get_embedding_engine():
+    class MockEngine:
+        tokenizer = None
+
+    return MockEngine()
+
 
 GROUND_TRUTH = {
     "whole_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThird paragraph is a bit longer and is finished with a dot.",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_end",
         },
     ],
     "cut_text": [
         {
             "text": "This is example text. It contains multiple sentences.",
-            "word_count": 8,
+            "chunk_size": 8,
             "cut_type": "paragraph_end",
         },
         {
             "text": "\nThis is a second paragraph. First two paragraphs are whole.",
-            "word_count": 10,
+            "chunk_size": 10,
             "cut_type": "paragraph_end",
         },
        {
             "text": "\nThird paragraph is cut and is missing the dot at the end",
-            "word_count": 12,
+            "chunk_size": 12,
             "cut_type": "sentence_cut",
         },
     ],
@@ -47,17 +59,18 @@ Third paragraph is cut and is missing the dot at the end""",
 }
 
-def run_chunking_test(test_text, expected_chunks):
+@patch.object(
+    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+)
+def run_chunking_test(test_text, expected_chunks, mock_engine):
     chunks = []
-    for chunk_data in chunk_by_paragraph(
-        data=test_text, paragraph_length=12, batch_paragraphs=False, max_chunk_tokens=512
-    ):
+    for chunk_data in chunk_by_paragraph(data=test_text, batch_paragraphs=False, max_chunk_size=12):
         chunks.append(chunk_data)
 
     assert len(chunks) == 3
 
     for expected_chunks_item, chunk in zip(expected_chunks, chunks):
-        for key in ["text", "word_count", "cut_type"]:
+        for key in ["text", "chunk_size", "cut_type"]:
             assert chunk[key] == expected_chunks_item[key], (
                 f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
             )
```

```diff
@@ -3,10 +3,11 @@ from itertools import product
 import numpy as np
 import pytest
 
-from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tasks.chunks import chunk_by_sentence
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS_LONGWORDS, INPUT_TEXTS
+from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
 
-maximum_length_vals = [None, 8, 64]
+maximum_length_vals = [None, 16, 64]
 
 
 @pytest.mark.parametrize(
@@ -33,9 +34,26 @@ def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
 def test_paragraph_chunk_length(input_text, maximum_length):
     chunks = list(chunk_by_sentence(input_text, maximum_length))
 
-    chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
+    embedding_engine = get_embedding_engine()
+    chunk_lengths = np.array(
+        [embedding_engine.tokenizer.count_tokens(chunk[1]) for chunk in chunks]
+    )
 
     larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
     assert np.all(chunk_lengths <= maximum_length), (
         f"{maximum_length = }: {larger_chunks} are too large"
     )
+
+
+@pytest.mark.parametrize(
+    "input_text,maximum_length",
+    list(
+        product(
+            list(INPUT_TEXTS_LONGWORDS.values()),
+            [val for val in maximum_length_vals if val is not None],
+        )
+    ),
+)
+def test_paragraph_chunk_long_input(input_text, maximum_length):
+    with pytest.raises(ValueError):
+        list(chunk_by_sentence(input_text, maximum_length))
```

```diff
@@ -2,7 +2,7 @@ import numpy as np
 import pytest
 
 from cognee.tasks.chunks import chunk_by_word
-from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS, INPUT_TEXTS_LONGWORDS
 
 
 @pytest.mark.parametrize(
@@ -11,7 +11,7 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_isomorphism(input_text):
@@ -28,7 +28,7 @@ def test_chunk_by_word_isomorphism(input_text):
         INPUT_TEXTS["english_text"],
         INPUT_TEXTS["english_lists"],
         INPUT_TEXTS["python_code"],
-        INPUT_TEXTS["chinese_text"],
+        INPUT_TEXTS_LONGWORDS["chinese_text"],
     ],
 )
 def test_chunk_by_word_splits(input_text):
```

```diff
@@ -179,7 +179,6 @@ def pad(
     mode: _ModeFunc,
     **kwargs: Any,
 ) -> NDArray[Any]: ...""",
-    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
     "english_text": """O for that warning voice, which he who saw
 Th' Apocalyps, heard cry in Heaven aloud,
 Then when the Dragon, put to second rout,
@@ -282,3 +281,7 @@ For never can true reconcilement grow
 Where wounds of deadly hate have peirc'd so deep:
 Which would but lead me to a worse relapse [ 100 ]""",
 }
+
+INPUT_TEXTS_LONGWORDS = {
+    "chinese_text": """在这个繁华的城市里,藏着一个古老的小巷,名叫杨柳巷。巷子两旁的青石板路已经被无数行人的脚步磨得发亮,斑驳的老墙上爬满了常青藤,给这个充满历史气息的小巷增添了一抹生机。每天清晨,巷子里都会飘出阵阵香气,那是张婆婆家的早点铺子散发出的包子和豆浆的味道。老店门前经常排着长队,有步履匆匆的上班族,也有悠闲散步的老人。巷子深处有一家传统的茶馆,古色古香的木桌椅上总是坐满了品茶聊天的街坊邻里。傍晚时分,夕阳的余晖洒在石板路上,为这个充满生活气息的小巷染上一层温暖的金色。街角的老榕树下,常常有卖唱的艺人在这里驻足,用沧桑的嗓音讲述着这座城市的故事。偶尔,还能看到三三两两的游客举着相机,试图捕捉这里独特的市井风情。这条看似普通的小巷,承载着太多市民的回忆和岁月的痕迹,它就像是这座城市的一个缩影,悄悄地诉说着曾经的故事。""",
+}
```

File diff suppressed because one or more lines are too long