From a774191ed3153442bbdc29a79e90f45c51bc5cc5 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev
Date: Tue, 7 Jan 2025 13:38:23 +0100
Subject: [PATCH] Adjust AudioDocument and handle None token limit

---
 .../data/processing/document_types/AudioDocument.py | 10 +++++++---
 cognee/tasks/chunks/chunk_by_paragraph.py           |  2 ++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index 268338703..a59064674 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -1,6 +1,10 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
+
 
 class AudioDocument(Document):
     type: str = "audio"
@@ -9,12 +13,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model: Optional[str], max_tokens: Optional[int]):
         # Transcribe the audio file
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 546d4a1a7..2bbd9689f 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -23,6 +23,8 @@ def chunk_by_paragraph(
     paragraph_ids = []
     last_cut_type = None
     current_token_count = 0
+    if not max_tokens:
+        max_tokens = float("inf")
 
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
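
Reviewer note (illustration, not part of the patch): the new guard in
chunk_by_paragraph treats any falsy max_tokens (None, but also 0) as "no
limit", so comparisons against current_token_count always succeed. A minimal
standalone sketch of that behavior, where resolve_token_limit is a
hypothetical helper name, not a function in the cognee codebase:

    from typing import Optional

    def resolve_token_limit(max_tokens: Optional[int]) -> float:
        # Mirrors the patch's `if not max_tokens: max_tokens = float("inf")`:
        # a falsy limit becomes +inf, so any finite token count stays below it
        # and token-based chunk cutting is effectively disabled.
        if not max_tokens:
            return float("inf")
        return float(max_tokens)

    assert resolve_token_limit(None) == float("inf")
    assert resolve_token_limit(0) == float("inf")
    assert resolve_token_limit(512) == 512.0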