From 9e7ab6492a87f18126ccc9ac5a76219c78a19003 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Tue, 17 Dec 2024 11:31:31 +0100
Subject: [PATCH] =?UTF-8?q?feat:=20outsources=20chunking=20parameters=20to?=
 =?UTF-8?q?=20extract=20chunk=20from=20documents=20=E2=80=A6=20(#289)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: outsource chunking parameters to the extract_chunks_from_documents task
---
 .../processing/document_types/AudioDocument.py    |  7 ++++---
 .../processing/document_types/ChunkerMapping.py   | 15 +++++++++++++++
 .../data/processing/document_types/Document.py    |  4 ++--
 .../processing/document_types/ImageDocument.py    |  7 ++++---
 .../data/processing/document_types/PdfDocument.py |  7 ++++---
 .../processing/document_types/TextDocument.py     |  8 +++++---
 .../documents/extract_chunks_from_documents.py    |  4 ++--
 .../integration/documents/AudioDocument_test.py   |  2 +-
 .../integration/documents/ImageDocument_test.py   |  2 +-
 .../integration/documents/PdfDocument_test.py     |  2 +-
 .../integration/documents/TextDocument_test.py    |  2 +-
 11 files changed, 40 insertions(+), 20 deletions(-)
 create mode 100644 cognee/modules/data/processing/document_types/ChunkerMapping.py

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index 0d2cddd3d..268338703 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -1,6 +1,6 @@
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.modules.chunking.TextChunker import TextChunker
 from .Document import Document
+from .ChunkerMapping import ChunkerConfig
 
 class AudioDocument(Document):
     type: str = "audio"
@@ -9,11 +9,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int):
+    def read(self, chunk_size: int, chunker: str):
         # Transcribe the audio file
         
         text = self.create_transcript()
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker_func = ChunkerConfig.get_chunker(chunker)  # resolve the chunker name to its class
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])  # NOTE: rebinds `chunker` from the name string to the chunker instance
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/ChunkerMapping.py b/cognee/modules/data/processing/document_types/ChunkerMapping.py
new file mode 100644
index 000000000..14dbb8bb7
--- /dev/null
+++ b/cognee/modules/data/processing/document_types/ChunkerMapping.py
@@ -0,0 +1,15 @@
+from cognee.modules.chunking.TextChunker import TextChunker
+
+class ChunkerConfig:  # Registry that maps chunker names to chunker classes.
+    chunker_mapping = {  # chunker name -> chunker class
+        "text_chunker": TextChunker
+    }
+
+    @classmethod
+    def get_chunker(cls, chunker_name: str):  # Return the class registered under chunker_name.
+        chunker_class = cls.chunker_mapping.get(chunker_name)
+        if chunker_class is None:  # name is not registered in chunker_mapping
+            raise NotImplementedError(
+                f"Chunker '{chunker_name}' is not implemented. Available options: {list(cls.chunker_mapping.keys())}"
+            )
+        return chunker_class
\ No newline at end of file
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 924ffabac..8d6a3dafb 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -13,5 +13,5 @@ class Document(DataPoint):
         "type": "Document"
     }
 
-    def read(self, chunk_size: int) -> str:
-        pass
\ No newline at end of file
+    def read(self, chunk_size: int, chunker: str = "text_chunker") -> str:  # base stub; `chunker` is a chunker name, not a type
+        pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index e8f0dd8ee..352486bd8 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -1,6 +1,6 @@
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.modules.chunking.TextChunker import TextChunker
 from .Document import Document
+from .ChunkerMapping import ChunkerConfig
 
 class ImageDocument(Document):
     type: str = "image"
@@ -10,10 +10,11 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return(result.choices[0].message.content)
 
-    def read(self, chunk_size: int):
+    def read(self, chunk_size: int, chunker: str):
         # Transcribe the image file
         text = self.transcribe_image()
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker_func = ChunkerConfig.get_chunker(chunker)  # resolve the chunker name to its class
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])  # NOTE: rebinds `chunker` from the name string to the chunker instance
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 2d1941996..361214718 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -1,11 +1,11 @@
 from pypdf import PdfReader
-from cognee.modules.chunking.TextChunker import TextChunker
 from .Document import Document
+from .ChunkerMapping import ChunkerConfig
 
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int):
+    def read(self, chunk_size: int, chunker: str):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -13,7 +13,8 @@ class PdfDocument(Document):
                 page_text = page.extract_text()
                 yield page_text
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
+        chunker_func = ChunkerConfig.get_chunker(chunker)  # resolve the chunker name to its class
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)  # NOTE: rebinds `chunker` from the name string to the chunker instance
 
         yield from chunker.read()
 
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 32d3416b9..3952d9845 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -1,10 +1,10 @@
-from cognee.modules.chunking.TextChunker import TextChunker
 from .Document import Document
+from .ChunkerMapping import ChunkerConfig
 
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int):
+    def read(self, chunk_size: int, chunker: str):
         def get_text():
             with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
                 while True:
@@ -15,6 +15,8 @@ class TextDocument(Document):
 
                     yield text
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
+        chunker_func = ChunkerConfig.get_chunker(chunker)  # resolve the chunker name to its class
+
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)  # NOTE: rebinds `chunker` from the name string to the chunker instance
 
         yield from chunker.read()
diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py
index ec19a786d..423b87b69 100644
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -1,7 +1,7 @@
 from cognee.modules.data.processing.document_types.Document import Document
 
 
-async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024):
+async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker: str = 'text_chunker'):  # chunker: chunker name forwarded to each document's read()
     for document in documents:
-        for document_chunk in document.read(chunk_size = chunk_size):
+        for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker):
             yield document_chunk
diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py
index da8b85d0b..151f4c0b2 100644
--- a/cognee/tests/integration/documents/AudioDocument_test.py
+++ b/cognee/tests/integration/documents/AudioDocument_test.py
@@ -31,7 +31,7 @@ def test_AudioDocument():
     )
     with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
         for ground_truth, paragraph_data in zip(
-            GROUND_TRUTH, document.read(chunk_size=64)
+            GROUND_TRUTH, document.read(chunk_size=64, chunker='text_chunker')
         ):
             assert (
                 ground_truth["word_count"] == paragraph_data.word_count
diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py
index 8a8ee8ef3..40e0155af 100644
--- a/cognee/tests/integration/documents/ImageDocument_test.py
+++ b/cognee/tests/integration/documents/ImageDocument_test.py
@@ -21,7 +21,7 @@ def test_ImageDocument():
     with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
 
         for ground_truth, paragraph_data in zip(
-            GROUND_TRUTH, document.read(chunk_size=64)
+            GROUND_TRUTH, document.read(chunk_size=64, chunker='text_chunker')
         ):
             assert (
                 ground_truth["word_count"] == paragraph_data.word_count
diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py
index ac57eaf75..25d4cf6c6 100644
--- a/cognee/tests/integration/documents/PdfDocument_test.py
+++ b/cognee/tests/integration/documents/PdfDocument_test.py
@@ -22,7 +22,7 @@ def test_PdfDocument():
     )
 
     for ground_truth, paragraph_data in zip(
-        GROUND_TRUTH, document.read(chunk_size=1024)
+        GROUND_TRUTH, document.read(chunk_size=1024, chunker='text_chunker')
     ):
         assert (
             ground_truth["word_count"] == paragraph_data.word_count
diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py
index f663418f5..91f38968e 100644
--- a/cognee/tests/integration/documents/TextDocument_test.py
+++ b/cognee/tests/integration/documents/TextDocument_test.py
@@ -33,7 +33,7 @@ def test_TextDocument(input_file, chunk_size):
     )
 
     for ground_truth, paragraph_data in zip(
-        GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size)
+        GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker='text_chunker')
     ):
         assert (
             ground_truth["word_count"] == paragraph_data.word_count