Enable different chunking methods

2024-08-08 20:19:40 +02:00 · 2024-08-08 20:19:40 +02:00 · 1070a09806
commit 1070a09806
parent 7c7b8a319b
3 changed files with 12 additions and 6 deletions
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -89,14 +89,16 @@ class AudioDocument(Document):
    type: str = "audio"
    title: str
    file_path: str
+    chunking_strategy:str
-    def __init__(self, id: UUID, title: str, file_path: str):
+    def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
        self.file_path = file_path
+        self.chunking_strategy = chunking_strategy
    def get_reader(self) -> AudioReader:
-        reader = AudioReader(self.id, self.file_path)
+        reader = AudioReader(self.id, self.file_path, self.chunking_strategy)
        return reader
    def to_dict(self) -> dict:
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -91,18 +91,20 @@ class PdfDocument(Document):
    title: str
    num_pages: int
    file_path: str
+    chunking_strategy:str
-    def __init__(self, id: UUID, title: str, file_path: str):
+    def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
        self.file_path = file_path
        logging.debug("file_path: %s", self.file_path)
        reader = PdfReader(self.id, self.file_path)
        self.num_pages = reader.get_number_of_pages()
+        self.chunking_strategy = chunking_strategy
    def get_reader(self) -> PdfReader:
        logging.debug("file_path: %s", self.file_path)
-        reader = PdfReader(self.id, self.file_path)
+        reader = PdfReader(self.id, self.file_path, self.chunking_strategy)
        return reader
    def to_dict(self) -> dict:
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -95,17 +95,19 @@ class TextDocument(Document):
    title: str
    num_pages: int
    file_path: str
+    chunking_strategy:str
-    def __init__(self, id: UUID, title: str, file_path: str):
+    def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
        self.file_path = file_path
+        self.chunking_strategy = chunking_strategy
        reader = TextReader(self.id, self.file_path)
        self.num_pages = reader.get_number_of_pages()
    def get_reader(self) -> TextReader:
-        reader = TextReader(self.id, self.file_path)
+        reader = TextReader(self.id, self.file_path, self.chunking_strategy)
        return reader
    def to_dict(self) -> dict: