Enable different chunking methods

This commit is contained in:
Vasilije 2024-08-08 20:19:40 +02:00
parent 7c7b8a319b
commit 1070a09806
3 changed files with 12 additions and 6 deletions

View file

@@ -89,14 +89,16 @@ class AudioDocument(Document):
type: str = "audio" type: str = "audio"
title: str title: str
file_path: str file_path: str
chunking_strategy:str
def __init__(self, id: UUID, title: str, file_path: str): def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title) self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title self.title = title
self.file_path = file_path self.file_path = file_path
self.chunking_strategy = chunking_strategy
def get_reader(self) -> AudioReader: def get_reader(self) -> AudioReader:
reader = AudioReader(self.id, self.file_path) reader = AudioReader(self.id, self.file_path, self.chunking_strategy)
return reader return reader
def to_dict(self) -> dict: def to_dict(self) -> dict:

View file

@@ -91,18 +91,20 @@ class PdfDocument(Document):
title: str title: str
num_pages: int num_pages: int
file_path: str file_path: str
chunking_strategy:str
def __init__(self, id: UUID, title: str, file_path: str): def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title) self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title self.title = title
self.file_path = file_path self.file_path = file_path
logging.debug("file_path: %s", self.file_path) logging.debug("file_path: %s", self.file_path)
reader = PdfReader(self.id, self.file_path) reader = PdfReader(self.id, self.file_path)
self.num_pages = reader.get_number_of_pages() self.num_pages = reader.get_number_of_pages()
self.chunking_strategy = chunking_strategy
def get_reader(self) -> PdfReader: def get_reader(self) -> PdfReader:
logging.debug("file_path: %s", self.file_path) logging.debug("file_path: %s", self.file_path)
reader = PdfReader(self.id, self.file_path) reader = PdfReader(self.id, self.file_path, self.chunking_strategy)
return reader return reader
def to_dict(self) -> dict: def to_dict(self) -> dict:

View file

@@ -95,17 +95,19 @@ class TextDocument(Document):
title: str title: str
num_pages: int num_pages: int
file_path: str file_path: str
chunking_strategy:str
def __init__(self, id: UUID, title: str, file_path: str): def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title) self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title self.title = title
self.file_path = file_path self.file_path = file_path
self.chunking_strategy = chunking_strategy
reader = TextReader(self.id, self.file_path) reader = TextReader(self.id, self.file_path)
self.num_pages = reader.get_number_of_pages() self.num_pages = reader.get_number_of_pages()
def get_reader(self) -> TextReader: def get_reader(self) -> TextReader:
reader = TextReader(self.id, self.file_path) reader = TextReader(self.id, self.file_path, self.chunking_strategy)
return reader return reader
def to_dict(self) -> dict: def to_dict(self) -> dict: