From abb3ea6d219f8221500fb7a7e7f6cc404cf75b08 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev
Date: Thu, 9 Jan 2025 11:31:16 +0100
Subject: [PATCH] Adjust integration tests

---
 .../data/processing/document_types/AudioDocument.py       | 2 +-
 cognee/modules/data/processing/document_types/Document.py | 2 +-
 .../data/processing/document_types/ImageDocument.py       | 2 +-
 .../modules/data/processing/document_types/PdfDocument.py | 2 +-
 .../data/processing/document_types/TextDocument.py        | 2 +-
 .../processing/document_types/UnstructuredDocument.py     | 2 +-
 .../integration/documents/UnstructuredDocument_test.py    | 8 ++++----
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index faace056b..b7d2476b4 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -13,7 +13,7 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the audio file
         text = self.create_transcript()
 
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 9a29e7797..7ecdf289e 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -11,5 +11,5 @@ class Document(DataPoint):
     mime_type: str
     _metadata: dict = {"index_fields": ["name"], "type": "Document"}
 
-    def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str:
+    def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index f0c7a6d61..c055b8253 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -13,7 +13,7 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the image file
         text = self.transcribe_image()
 
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 56969c7f8..768f91264 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -9,7 +9,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 11dc798aa..b62ccd56e 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -7,7 +7,7 @@ from .Document import Document
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index d6b64498c..1c291d0dc 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -10,7 +10,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py
index 03b8deb49..e0278de81 100644
--- a/cognee/tests/integration/documents/UnstructuredDocument_test.py
+++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@@ -68,7 +68,7 @@ def test_UnstructuredDocument():
     )
 
     # Test PPTX
-    for paragraph_data in pptx_document.read(chunk_size=1024):
+    for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
         assert (
@@ -76,7 +76,7 @@ def test_UnstructuredDocument():
         ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test DOCX
-    for paragraph_data in docx_document.read(chunk_size=1024):
+    for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
         assert (
@@ -84,7 +84,7 @@ def test_UnstructuredDocument():
         ), f" sentence_end != {paragraph_data.cut_type = }"
 
     # TEST CSV
-    for paragraph_data in csv_document.read(chunk_size=1024):
+    for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
         assert (
             "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
@@ -94,7 +94,7 @@ def test_UnstructuredDocument():
         ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test XLSX
-    for paragraph_data in xlsx_document.read(chunk_size=1024):
+    for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
         assert (