Enable different chunking methods

This commit is contained in:
Vasilije 2024-08-08 20:19:40 +02:00
parent 7c7b8a319b
commit 1070a09806
3 changed files with 12 additions and 6 deletions

View file

@@ -89,14 +89,16 @@ class AudioDocument(Document):
type: str = "audio"
title: str
file_path: str
chunking_strategy:str
def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.chunking_strategy = chunking_strategy
def get_reader(self) -> AudioReader:
reader = AudioReader(self.id, self.file_path)
reader = AudioReader(self.id, self.file_path, self.chunking_strategy)
return reader
def to_dict(self) -> dict:

View file

@@ -91,18 +91,20 @@ class PdfDocument(Document):
title: str
num_pages: int
file_path: str
chunking_strategy:str
def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
logging.debug("file_path: %s", self.file_path)
reader = PdfReader(self.id, self.file_path)
self.num_pages = reader.get_number_of_pages()
self.chunking_strategy = chunking_strategy
def get_reader(self) -> PdfReader:
logging.debug("file_path: %s", self.file_path)
reader = PdfReader(self.id, self.file_path)
reader = PdfReader(self.id, self.file_path, self.chunking_strategy)
return reader
def to_dict(self) -> dict:

View file

@@ -95,17 +95,19 @@ class TextDocument(Document):
title: str
num_pages: int
file_path: str
chunking_strategy:str
def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.chunking_strategy = chunking_strategy
reader = TextReader(self.id, self.file_path)
self.num_pages = reader.get_number_of_pages()
def get_reader(self) -> TextReader:
reader = TextReader(self.id, self.file_path)
reader = TextReader(self.id, self.file_path, self.chunking_strategy)
return reader
def to_dict(self) -> dict: