from uuid import NAMESPACE_OID, uuid5

from cognee.shared.logging_utils import get_logger
from cognee.tasks.chunks import chunk_by_paragraph
from cognee.modules.chunking.Chunker import Chunker

from .models.DocumentChunk import DocumentChunk

logger = get_logger()


class TextChunker(Chunker):
    async def read(self):
        paragraph_chunks = []

        async for content_text in self.get_text():
            for chunk_data in chunk_by_paragraph(
                content_text,
                self.max_chunk_size,
                batch_paragraphs=True,
            ):
                if self.chunk_size + chunk_data["chunk_size"] <= self.max_chunk_size:
                    # The paragraph still fits into the current chunk: accumulate it.
                    paragraph_chunks.append(chunk_data)
                    self.chunk_size += chunk_data["chunk_size"]
                else:
                    if len(paragraph_chunks) == 0:
                        # Nothing accumulated, so this single paragraph alone exceeds
                        # max_chunk_size: emit it as its own chunk, keeping the id
                        # produced by chunk_by_paragraph.
                        yield DocumentChunk(
                            id=chunk_data["chunk_id"],
                            text=chunk_data["text"],
                            chunk_size=chunk_data["chunk_size"],
                            is_part_of=self.document,
                            chunk_index=self.chunk_index,
                            cut_type=chunk_data["cut_type"],
                            contains=[],
                            metadata={
                                "index_fields": ["text"],
                            },
                        )
                        paragraph_chunks = []
                        self.chunk_size = 0
                    else:
                        # The accumulated paragraphs form a full chunk: emit them
                        # together under a deterministic id derived from the document
                        # id and the running chunk index.
                        chunk_text = " ".join(chunk["text"] for chunk in paragraph_chunks)
                        try:
                            yield DocumentChunk(
                                id=uuid5(
                                    NAMESPACE_OID, f"{self.document.id}-{self.chunk_index}"
                                ),
                                text=chunk_text,
                                chunk_size=self.chunk_size,
                                is_part_of=self.document,
                                chunk_index=self.chunk_index,
                                cut_type=paragraph_chunks[-1]["cut_type"],
                                contains=[],
                                metadata={
                                    "index_fields": ["text"],
                                },
                            )
                        except Exception as e:
                            logger.error(e)
                            raise
                        # Start the next chunk with the paragraph that did not fit.
                        paragraph_chunks = [chunk_data]
                        self.chunk_size = chunk_data["chunk_size"]

                    self.chunk_index += 1

        # Flush whatever remains once the input text is exhausted.
        if len(paragraph_chunks) > 0:
            try:
                yield DocumentChunk(
                    id=uuid5(NAMESPACE_OID, f"{self.document.id}-{self.chunk_index}"),
                    text=" ".join(chunk["text"] for chunk in paragraph_chunks),
                    chunk_size=self.chunk_size,
                    is_part_of=self.document,
                    chunk_index=self.chunk_index,
                    cut_type=paragraph_chunks[-1]["cut_type"],
                    contains=[],
                    metadata={"index_fields": ["text"]},
                )
            except Exception as e:
                logger.error(e)
                raise
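

# --- Usage sketch (illustrative, not exercised by the library) ---
# A small __main__ probe of the paragraph stream this chunker consumes. It only
# uses what the code above already relies on: chunk_by_paragraph called with
# (text, max_chunk_size, batch_paragraphs=True), yielding dicts with "chunk_id",
# "chunk_size", "cut_type", and "text" keys. The sample text and
# max_chunk_size=64 are arbitrary values chosen for the demo, and depending on
# the cognee version, chunk sizing may require a configured tokenizer or
# embedding engine.
if __name__ == "__main__":
    sample = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."

    for piece in chunk_by_paragraph(sample, 64, batch_paragraphs=True):
        print(piece["chunk_id"], piece["chunk_size"], piece["cut_type"], repr(piece["text"]))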