From 6bf38e07849c64e6775de6ae1fe530d5eaaaf208 Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Fri, 7 Jun 2024 09:53:20 +0200
Subject: [PATCH] extend langchain splitter with character splitter

---
 .../data/chunking/LangchainChunkingEngine.py   | 27 +++++++++++++++++++
 cognee/shared/data_models.py                   |  1 +
 docs/blog/posts/llmops-and-knowledge-graphs.md |  0
 3 files changed, 28 insertions(+)
 create mode 100644 docs/blog/posts/llmops-and-knowledge-graphs.md

diff --git a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
index c936bbe66..3b821df9f 100644
--- a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
+++ b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
@@ -29,6 +29,9 @@ class LangchainChunkEngine():
 
         if chunk_strategy == ChunkStrategy.CODE:
             chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
+
+        elif chunk_strategy == ChunkStrategy.LANGCHAIN_CHARACTER:
+            chunked_data = LangchainChunkEngine.chunk_data_by_character(source_data,chunk_size, chunk_overlap)
         else:
             chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
         return chunked_data
@@ -50,3 +53,27 @@ class LangchainChunkEngine():
 
         return only_content
 
+    @staticmethod
+    def chunk_data_by_character(data_chunks, chunk_size, chunk_overlap):
+        """Split text into chunks with LangChain's recursive character splitter.
+
+        Parameters:
+        - data_chunks: The text to split (a single string).
+        - chunk_size: Maximum size of each chunk, in characters.
+        - chunk_overlap: Number of characters shared by consecutive chunks.
+
+        Returns:
+        - A list of strings, one per chunk.
+        """
+        from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+        # The splitter's first positional parameters are `separators` and
+        # `keep_separator`, so the sizes must be passed by keyword.
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+        )
+
+        # `split_text` returns the chunk strings directly.
+        return splitter.split_text(data_chunks)
+
diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py
index 7e228bf8b..906cd537d 100644
--- a/cognee/shared/data_models.py
+++ b/cognee/shared/data_models.py
@@ -35,6 +35,7 @@ class ChunkStrategy(Enum):
     PARAGRAPH = "paragraph"
     SENTENCE = "sentence"
     CODE = "code"
+    LANGCHAIN_CHARACTER = "langchain_character"
 
 class MemorySummary(BaseModel):
     """ Memory summary. """
diff --git a/docs/blog/posts/llmops-and-knowledge-graphs.md b/docs/blog/posts/llmops-and-knowledge-graphs.md
new file mode 100644
index 000000000..e69de29bb