Adjust integration tests
This commit is contained in:
parent 6762039711
commit abb3ea6d21
7 changed files with 10 additions and 10 deletions
|
|
@ -13,7 +13,7 @@ class AudioDocument(Document):
|
||||||
result = get_llm_client().create_transcript(self.raw_data_location)
|
result = get_llm_client().create_transcript(self.raw_data_location)
|
||||||
return result.text
|
return result.text
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
|
||||||
# Transcribe the audio file
|
# Transcribe the audio file
|
||||||
|
|
||||||
text = self.create_transcript()
|
text = self.create_transcript()
|
||||||
|
|
|
||||||
|
|
@ -11,5 +11,5 @@ class Document(DataPoint):
|
||||||
mime_type: str
|
mime_type: str
|
||||||
_metadata: dict = {"index_fields": ["name"], "type": "Document"}
|
_metadata: dict = {"index_fields": ["name"], "type": "Document"}
|
||||||
|
|
||||||
def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str:
|
def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ class ImageDocument(Document):
|
||||||
result = get_llm_client().transcribe_image(self.raw_data_location)
|
result = get_llm_client().transcribe_image(self.raw_data_location)
|
||||||
return result.choices[0].message.content
|
return result.choices[0].message.content
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
|
||||||
# Transcribe the image file
|
# Transcribe the image file
|
||||||
text = self.transcribe_image()
|
text = self.transcribe_image()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ from .Document import Document
|
||||||
class PdfDocument(Document):
|
class PdfDocument(Document):
|
||||||
type: str = "pdf"
|
type: str = "pdf"
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
|
||||||
file = PdfReader(self.raw_data_location)
|
file = PdfReader(self.raw_data_location)
|
||||||
|
|
||||||
def get_text():
|
def get_text():
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ from .Document import Document
|
||||||
class TextDocument(Document):
|
class TextDocument(Document):
|
||||||
type: str = "text"
|
type: str = "text"
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
|
||||||
def get_text():
|
def get_text():
|
||||||
with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
|
with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
|
||||||
while True:
|
while True:
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from .Document import Document
|
||||||
class UnstructuredDocument(Document):
|
class UnstructuredDocument(Document):
|
||||||
type: str = "unstructured"
|
type: str = "unstructured"
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
|
||||||
def get_text():
|
def get_text():
|
||||||
try:
|
try:
|
||||||
from unstructured.partition.auto import partition
|
from unstructured.partition.auto import partition
|
||||||
|
|
|
||||||
|
|
@ -68,7 +68,7 @@ def test_UnstructuredDocument():
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test PPTX
|
# Test PPTX
|
||||||
for paragraph_data in pptx_document.read(chunk_size=1024):
|
for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||||
assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
|
assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
|
||||||
assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
|
assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
|
||||||
assert (
|
assert (
|
||||||
|
|
@ -76,7 +76,7 @@ def test_UnstructuredDocument():
|
||||||
), f" sentence_cut != {paragraph_data.cut_type = }"
|
), f" sentence_cut != {paragraph_data.cut_type = }"
|
||||||
|
|
||||||
# Test DOCX
|
# Test DOCX
|
||||||
for paragraph_data in docx_document.read(chunk_size=1024):
|
for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||||
assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
|
assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
|
||||||
assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
|
assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
|
||||||
assert (
|
assert (
|
||||||
|
|
@ -84,7 +84,7 @@ def test_UnstructuredDocument():
|
||||||
), f" sentence_end != {paragraph_data.cut_type = }"
|
), f" sentence_end != {paragraph_data.cut_type = }"
|
||||||
|
|
||||||
# TEST CSV
|
# TEST CSV
|
||||||
for paragraph_data in csv_document.read(chunk_size=1024):
|
for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||||
assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
|
assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
|
||||||
assert (
|
assert (
|
||||||
"A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
|
"A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
|
||||||
|
|
@ -94,7 +94,7 @@ def test_UnstructuredDocument():
|
||||||
), f" sentence_cut != {paragraph_data.cut_type = }"
|
), f" sentence_cut != {paragraph_data.cut_type = }"
|
||||||
|
|
||||||
# Test XLSX
|
# Test XLSX
|
||||||
for paragraph_data in xlsx_document.read(chunk_size=1024):
|
for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
|
||||||
assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
|
assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
|
||||||
assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
|
assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
|
||||||
assert (
|
assert (
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue