cognee/cognee/modules/chunking/LangchainChunker.py

from typing import Callable
from uuid import NAMESPACE_OID, uuid5

from langchain_text_splitters import RecursiveCharacterTextSplitter

from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.modules.chunking.Chunker import Chunker
from cognee.shared.logging_utils import get_logger

from .models.DocumentChunk import DocumentChunk

logger = get_logger()

class LangchainChunker(Chunker):
    """
    A Chunker that splits text into chunks using Langchain's RecursiveCharacterTextSplitter.

    Length is measured in whitespace-separated words, so each chunk holds roughly
    `chunk_size` words, and consecutive chunks share about `chunk_overlap` words
    of overlap.
    """

    def __init__(
        self,
        document,
        get_text: Callable,
        max_chunk_tokens: int,
        chunk_size: int = 1024,
        chunk_overlap: int = 10,
    ):
        super().__init__(document, get_text, max_chunk_tokens, chunk_size)
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Count whitespace-separated words instead of characters.
            length_function=lambda text: len(text.split()),
        )

    async def read(self):
        # Resolve the embedding engine once; its tokenizer validates chunk sizes.
        embedding_engine = get_vector_engine().embedding_engine

        async for content_text in self.get_text():
            for chunk in self.splitter.split_text(content_text):
                token_count = embedding_engine.tokenizer.count_tokens(chunk)

                if token_count <= self.max_chunk_tokens:
                    yield DocumentChunk(
                        # uuid5 makes chunk IDs deterministic: identical chunk
                        # text always produces the same ID.
                        id=uuid5(NAMESPACE_OID, chunk),
                        text=chunk,
                        word_count=len(chunk.split()),
                        token_count=token_count,
                        is_part_of=self.document,
                        chunk_index=self.chunk_index,
                        cut_type="missing",
                        contains=[],
                        metadata={
                            "index_fields": ["text"],
                        },
                    )
                    self.chunk_index += 1
                else:
                    raise ValueError(
                        f"Chunk of {token_count} tokens is larger than the maximum of "
                        f"{self.max_chunk_tokens} tokens. Please reduce chunk_size in "
                        "RecursiveCharacterTextSplitter."
                    )
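
A minimal usage sketch follows, for illustration only: `demo_get_text` is a hypothetical stand-in for a real Document's text-provider callable, `document=None` is a placeholder for a real cognee Document instance, and `read()` additionally assumes a configured vector/embedding engine is reachable via `get_vector_engine()`.

import asyncio

async def demo_get_text():
    # Async generator yielding raw text for the chunker; in real use this
    # would stream a Document's contents.
    yield "lorem ipsum dolor sit amet " * 500

async def main():
    chunker = LangchainChunker(
        document=None,  # pass a real cognee Document instance in practice
        get_text=demo_get_text,
        max_chunk_tokens=512,
        chunk_size=256,    # ~256 words per chunk
        chunk_overlap=10,  # ~10 words shared between consecutive chunks
    )
    async for chunk in chunker.read():
        print(chunk.chunk_index, chunk.word_count, chunk.token_count)

asyncio.run(main())

Counting words rather than characters in `length_function` keeps `chunk_size` in roughly the same unit as `max_chunk_tokens`, but the two are not identical: the guard in `read()` re-counts each chunk with the embedding engine's tokenizer, which is the authoritative limit.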