Enable different chunking methods via a `chunking_strategy` parameter
This commit is contained in:
parent
7c7b8a319b
commit
1070a09806
3 changed files with 12 additions and 6 deletions
|
|
@ -89,14 +89,16 @@ class AudioDocument(Document):
|
|||
type: str = "audio"
|
||||
title: str
|
||||
file_path: str
|
||||
chunking_strategy:str
|
||||
|
||||
def __init__(self, id: UUID, title: str, file_path: str):
|
||||
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
|
||||
self.id = id or uuid5(NAMESPACE_OID, title)
|
||||
self.title = title
|
||||
self.file_path = file_path
|
||||
self.chunking_strategy = chunking_strategy
|
||||
|
||||
def get_reader(self) -> AudioReader:
|
||||
reader = AudioReader(self.id, self.file_path)
|
||||
reader = AudioReader(self.id, self.file_path, self.chunking_strategy)
|
||||
return reader
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
|
|
|
|||
|
|
@ -91,18 +91,20 @@ class PdfDocument(Document):
|
|||
title: str
|
||||
num_pages: int
|
||||
file_path: str
|
||||
chunking_strategy:str
|
||||
|
||||
def __init__(self, id: UUID, title: str, file_path: str):
|
||||
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
|
||||
self.id = id or uuid5(NAMESPACE_OID, title)
|
||||
self.title = title
|
||||
self.file_path = file_path
|
||||
logging.debug("file_path: %s", self.file_path)
|
||||
reader = PdfReader(self.id, self.file_path)
|
||||
self.num_pages = reader.get_number_of_pages()
|
||||
self.chunking_strategy = chunking_strategy
|
||||
|
||||
def get_reader(self) -> PdfReader:
|
||||
logging.debug("file_path: %s", self.file_path)
|
||||
reader = PdfReader(self.id, self.file_path)
|
||||
reader = PdfReader(self.id, self.file_path, self.chunking_strategy)
|
||||
return reader
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
|
|
|
|||
|
|
@ -95,17 +95,19 @@ class TextDocument(Document):
|
|||
title: str
|
||||
num_pages: int
|
||||
file_path: str
|
||||
chunking_strategy:str
|
||||
|
||||
def __init__(self, id: UUID, title: str, file_path: str):
|
||||
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
|
||||
self.id = id or uuid5(NAMESPACE_OID, title)
|
||||
self.title = title
|
||||
self.file_path = file_path
|
||||
self.chunking_strategy = chunking_strategy
|
||||
|
||||
reader = TextReader(self.id, self.file_path)
|
||||
self.num_pages = reader.get_number_of_pages()
|
||||
|
||||
def get_reader(self) -> TextReader:
|
||||
reader = TextReader(self.id, self.file_path)
|
||||
reader = TextReader(self.id, self.file_path, self.chunking_strategy)
|
||||
return reader
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue