diff --git a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
index c936bbe66..3b821df9f 100644
--- a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
+++ b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
@@ -29,6 +29,9 @@ class LangchainChunkEngine():
 
         if chunk_strategy == ChunkStrategy.CODE:
             chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
+
+        elif chunk_strategy == ChunkStrategy.LANGCHAIN_CHARACTER:
+            chunked_data = LangchainChunkEngine.chunk_data_by_character(source_data,chunk_size, chunk_overlap)
         else:
             chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
         return chunked_data
@@ -50,3 +53,36 @@ class LangchainChunkEngine():
 
         return only_content
 
+    @staticmethod
+    def chunk_data_by_character(data_chunks, chunk_size, chunk_overlap):
+        """Split text into chunks with RecursiveCharacterTextSplitter.
+
+        Static because ``chunk_data`` calls it on the class with three
+        positional arguments (no instance is involved).
+
+        Args:
+            data_chunks: Text to split; a single string or an iterable
+                of strings (each one is split independently).
+            chunk_size: Forwarded to the splitter as ``chunk_size``.
+            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
+
+        Returns:
+            A list with the ``page_content`` of every produced chunk.
+        """
+        from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+        # Keyword arguments are required here: the first positional
+        # parameter of RecursiveCharacterTextSplitter is ``separators``.
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+        )
+        # create_documents() expects a list of texts and returns objects
+        # exposing ``page_content``; text splitters have no ``split()``.
+        texts = [data_chunks] if isinstance(data_chunks, str) else list(data_chunks)
+        data = splitter.create_documents(texts)
+
+        only_content = [chunk.page_content for chunk in data]
+
+        return only_content
+
diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py
index 7e228bf8b..906cd537d 100644
--- a/cognee/shared/data_models.py
+++ b/cognee/shared/data_models.py
@@ -35,6 +35,7 @@ class ChunkStrategy(Enum):
     PARAGRAPH = "paragraph"
     SENTENCE = "sentence"
     CODE = "code"
+    LANGCHAIN_CHARACTER = "langchain_character"
 
 class MemorySummary(BaseModel):
     """ Memory summary. """
diff --git a/docs/blog/posts/llmops-and-knowledge-graphs.md b/docs/blog/posts/llmops-and-knowledge-graphs.md
new file mode 100644
index 000000000..e69de29bb