From 9e7ab6492a87f18126ccc9ac5a76219c78a19003 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 17 Dec 2024 11:31:31 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20outsources=20chunking=20parameters=20to?= =?UTF-8?q?=20extract=20chunk=20from=20documents=20=E2=80=A6=20(#289)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: outsources chunking parameters to extract chunk from documents task --- .../processing/document_types/AudioDocument.py | 7 ++++--- .../processing/document_types/ChunkerMapping.py | 15 +++++++++++++++ .../data/processing/document_types/Document.py | 4 ++-- .../processing/document_types/ImageDocument.py | 7 ++++--- .../data/processing/document_types/PdfDocument.py | 7 ++++--- .../processing/document_types/TextDocument.py | 8 +++++--- .../documents/extract_chunks_from_documents.py | 4 ++-- .../integration/documents/AudioDocument_test.py | 2 +- .../integration/documents/ImageDocument_test.py | 2 +- .../integration/documents/PdfDocument_test.py | 2 +- .../integration/documents/TextDocument_test.py | 2 +- 11 files changed, 40 insertions(+), 20 deletions(-) create mode 100644 cognee/modules/data/processing/document_types/ChunkerMapping.py diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index 0d2cddd3d..268338703 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -1,6 +1,6 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client -from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document +from .ChunkerMapping import ChunkerConfig class AudioDocument(Document): type: str = "audio" @@ -9,11 +9,12 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return(result.text) - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker: str): # Transcribe the audio file text = self.create_transcript() - chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker_func = ChunkerConfig.get_chunker(chunker) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/ChunkerMapping.py b/cognee/modules/data/processing/document_types/ChunkerMapping.py new file mode 100644 index 000000000..14dbb8bb7 --- /dev/null +++ b/cognee/modules/data/processing/document_types/ChunkerMapping.py @@ -0,0 +1,15 @@ +from cognee.modules.chunking.TextChunker import TextChunker + +class ChunkerConfig: + chunker_mapping = { + "text_chunker": TextChunker + } + + @classmethod + def get_chunker(cls, chunker_name: str): + chunker_class = cls.chunker_mapping.get(chunker_name) + if chunker_class is None: + raise NotImplementedError( + f"Chunker '{chunker_name}' is not implemented. Available options: {list(cls.chunker_mapping.keys())}" + ) + return chunker_class \ No newline at end of file diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 924ffabac..8d6a3dafb 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -13,5 +13,5 @@ class Document(DataPoint): "type": "Document" } - def read(self, chunk_size: int) -> str: - pass \ No newline at end of file + def read(self, chunk_size: int, chunker = str) -> str: + pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index e8f0dd8ee..352486bd8 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -1,6 +1,6 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client -from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document +from .ChunkerMapping import ChunkerConfig class ImageDocument(Document): type: str = "image" @@ -10,10 +10,11 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return(result.choices[0].message.content) - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker: str): # Transcribe the image file text = self.transcribe_image() - chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker_func = ChunkerConfig.get_chunker(chunker) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 2d1941996..361214718 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,11 +1,11 @@ from pypdf import PdfReader -from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document +from .ChunkerMapping import ChunkerConfig class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker: str): file = PdfReader(self.raw_data_location) def get_text(): @@ -13,7 +13,8 @@ class PdfDocument(Document): page_text = page.extract_text() yield page_text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) + chunker_func = ChunkerConfig.get_chunker(chunker) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 32d3416b9..3952d9845 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -1,10 +1,10 @@ -from cognee.modules.chunking.TextChunker import TextChunker from .Document import Document +from .ChunkerMapping import ChunkerConfig class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker: str): def get_text(): with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file: while True: @@ -15,6 +15,8 @@ class TextDocument(Document): yield text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) + chunker_func = ChunkerConfig.get_chunker(chunker) + + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) yield from chunker.read() diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index ec19a786d..423b87b69 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -1,7 +1,7 @@ from cognee.modules.data.processing.document_types.Document import Document -async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024): +async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker = 'text_chunker'): for document in documents: - for document_chunk in document.read(chunk_size = chunk_size): + for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker): yield document_chunk diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index da8b85d0b..151f4c0b2 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -31,7 +31,7 @@ def test_AudioDocument(): ) with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): for ground_truth, paragraph_data in zip( - GROUND_TRUTH, document.read(chunk_size=64) + GROUND_TRUTH, document.read(chunk_size=64, chunker='text_chunker') ): assert ( ground_truth["word_count"] == paragraph_data.word_count diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index 8a8ee8ef3..40e0155af 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -21,7 +21,7 @@ def test_ImageDocument(): with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT): for ground_truth, paragraph_data in zip( - GROUND_TRUTH, document.read(chunk_size=64) + GROUND_TRUTH, document.read(chunk_size=64, chunker='text_chunker') ): assert ( ground_truth["word_count"] == paragraph_data.word_count diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py index ac57eaf75..25d4cf6c6 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -22,7 +22,7 @@ def test_PdfDocument(): ) for ground_truth, paragraph_data in zip( - GROUND_TRUTH, document.read(chunk_size=1024) + GROUND_TRUTH, document.read(chunk_size=1024, chunker='text_chunker') ): assert ( ground_truth["word_count"] == paragraph_data.word_count diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index f663418f5..91f38968e 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -33,7 +33,7 @@ def test_TextDocument(input_file, chunk_size): ) for ground_truth, paragraph_data in zip( - GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size) + GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker='text_chunker') ): assert ( ground_truth["word_count"] == paragraph_data.word_count