Enable different chunking methods
This commit is contained in:
parent
7c7b8a319b
commit
1070a09806
3 changed files with 12 additions and 6 deletions
|
|
@@ -89,14 +89,16 @@ class AudioDocument(Document):
|
||||||
type: str = "audio"
|
type: str = "audio"
|
||||||
title: str
|
title: str
|
||||||
file_path: str
|
file_path: str
|
||||||
|
chunking_strategy:str
|
||||||
|
|
||||||
def __init__(self, id: UUID, title: str, file_path: str):
|
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
|
||||||
self.id = id or uuid5(NAMESPACE_OID, title)
|
self.id = id or uuid5(NAMESPACE_OID, title)
|
||||||
self.title = title
|
self.title = title
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
|
self.chunking_strategy = chunking_strategy
|
||||||
|
|
||||||
def get_reader(self) -> AudioReader:
|
def get_reader(self) -> AudioReader:
|
||||||
reader = AudioReader(self.id, self.file_path)
|
reader = AudioReader(self.id, self.file_path, self.chunking_strategy)
|
||||||
return reader
|
return reader
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
|
|
|
||||||
|
|
@@ -91,18 +91,20 @@ class PdfDocument(Document):
|
||||||
title: str
|
title: str
|
||||||
num_pages: int
|
num_pages: int
|
||||||
file_path: str
|
file_path: str
|
||||||
|
chunking_strategy:str
|
||||||
|
|
||||||
def __init__(self, id: UUID, title: str, file_path: str):
|
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
|
||||||
self.id = id or uuid5(NAMESPACE_OID, title)
|
self.id = id or uuid5(NAMESPACE_OID, title)
|
||||||
self.title = title
|
self.title = title
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
logging.debug("file_path: %s", self.file_path)
|
logging.debug("file_path: %s", self.file_path)
|
||||||
reader = PdfReader(self.id, self.file_path)
|
reader = PdfReader(self.id, self.file_path)
|
||||||
self.num_pages = reader.get_number_of_pages()
|
self.num_pages = reader.get_number_of_pages()
|
||||||
|
self.chunking_strategy = chunking_strategy
|
||||||
|
|
||||||
def get_reader(self) -> PdfReader:
|
def get_reader(self) -> PdfReader:
|
||||||
logging.debug("file_path: %s", self.file_path)
|
logging.debug("file_path: %s", self.file_path)
|
||||||
reader = PdfReader(self.id, self.file_path)
|
reader = PdfReader(self.id, self.file_path, self.chunking_strategy)
|
||||||
return reader
|
return reader
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
|
|
|
||||||
|
|
@@ -95,17 +95,19 @@ class TextDocument(Document):
|
||||||
title: str
|
title: str
|
||||||
num_pages: int
|
num_pages: int
|
||||||
file_path: str
|
file_path: str
|
||||||
|
chunking_strategy:str
|
||||||
|
|
||||||
def __init__(self, id: UUID, title: str, file_path: str):
|
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
|
||||||
self.id = id or uuid5(NAMESPACE_OID, title)
|
self.id = id or uuid5(NAMESPACE_OID, title)
|
||||||
self.title = title
|
self.title = title
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
|
self.chunking_strategy = chunking_strategy
|
||||||
|
|
||||||
reader = TextReader(self.id, self.file_path)
|
reader = TextReader(self.id, self.file_path)
|
||||||
self.num_pages = reader.get_number_of_pages()
|
self.num_pages = reader.get_number_of_pages()
|
||||||
|
|
||||||
def get_reader(self) -> TextReader:
|
def get_reader(self) -> TextReader:
|
||||||
reader = TextReader(self.id, self.file_path)
|
reader = TextReader(self.id, self.file_path, self.chunking_strategy)
|
||||||
return reader
|
return reader
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue