extend langchain splitter with character splitter

This commit is contained in:
Vasilije 2024-06-07 09:53:20 +02:00
parent e69571b306
commit 6bf38e0784
3 changed files with 13 additions and 0 deletions

View file

@ -29,6 +29,9 @@ class LangchainChunkEngine():
if chunk_strategy == ChunkStrategy.CODE:
chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
elif chunk_strategy == ChunkStrategy.LANGCHAIN_CHARACTER:
chunked_data = LangchainChunkEngine.chunk_data_by_character(source_data,chunk_size, chunk_overlap)
else:
chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
return chunked_data
@ -50,3 +53,12 @@ class LangchainChunkEngine():
return only_content
def chunk_data_by_character(self, data_chunks, chunk_size, chunk_overlap):
    """Split text into overlapping chunks with LangChain's recursive character splitter.

    Args:
        data_chunks: The text to split. Either a single string or an
            iterable of strings; each string is split independently.
        chunk_size: Maximum size of each produced chunk, as measured by the
            splitter's length function (characters by default).
        chunk_overlap: Amount of overlap between consecutive chunks.

    Returns:
        A list of chunk strings, in document order. Empty or ``None`` input
        yields an empty list.
    """
    # Nothing to split: return early without requiring the optional
    # langchain dependency to be importable.
    if not data_chunks:
        return []

    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # The sizes must be passed by keyword: the first positional parameters of
    # RecursiveCharacterTextSplitter are ``separators`` and ``keep_separator``,
    # so positional sizes would silently be misinterpreted.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # create_documents expects a list of texts; wrap a bare string so it is
    # not iterated character by character.
    texts = [data_chunks] if isinstance(data_chunks, str) else list(data_chunks)
    documents = splitter.create_documents(texts)

    return [document.page_content for document in documents]

View file

@ -35,6 +35,7 @@ class ChunkStrategy(Enum):
PARAGRAPH = "paragraph"
SENTENCE = "sentence"
CODE = "code"
LANGCHAIN_CHARACTER = "langchain_character"
class MemorySummary(BaseModel):
""" Memory summary. """