Adjust AudioDocument and handle None token limit

Rita Aleksziev 2025-01-07 13:38:23 +01:00
parent fbf8fc93bf
commit a774191ed3
2 changed files with 9 additions and 3 deletions


@@ -1,6 +1,10 @@
+from typing import Optional
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
 
 class AudioDocument(Document):
     type: str = "audio"
@@ -9,12 +13,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
         # Transcribe the audio file
         text = self.create_transcript()
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
         yield from chunker.read()
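Taken together, the changes in this file only widen the read() signature and forward the two new arguments to whichever chunker ChunkerConfig resolves. A minimal sketch of that flow, using a made-up DummyChunker and read_audio helper (these names, the 1024 chunk size, and the embedding-model string are illustrative and not part of the commit):

from typing import Callable, Iterator, List, Optional


class DummyChunker:
    """Illustrative stand-in for whatever ChunkerConfig.get_chunker() returns."""

    def __init__(self, document, chunk_size: int, get_text: Callable[[], List[str]],
                 embedding_model: Optional[str] = None, max_tokens: Optional[int] = None):
        self.document = document
        self.chunk_size = chunk_size
        self.get_text = get_text
        self.embedding_model = embedding_model
        # Mirror the new behaviour: a missing limit means "no limit".
        self.max_tokens = max_tokens if max_tokens is not None else float("inf")

    def read(self) -> Iterator[str]:
        # A real chunker would split by paragraphs and token budget;
        # this one just yields the transcript pieces unchanged.
        yield from self.get_text()


def read_audio(document, chunk_size: int, chunker_cls,
               embedding_model: Optional[str], max_tokens: Optional[int]) -> Iterator[str]:
    # Same shape as the updated AudioDocument.read(): the two new arguments
    # are simply passed through to the chunker constructor.
    text = "transcribed audio text"  # stand-in for self.create_transcript()
    chunker = chunker_cls(document, chunk_size=chunk_size, get_text=lambda: [text],
                          embedding_model=embedding_model, max_tokens=max_tokens)
    yield from chunker.read()


for chunk in read_audio(None, 1024, DummyChunker,
                        embedding_model="text-embedding-3-large", max_tokens=None):
    print(chunk)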


@@ -23,6 +23,8 @@ def chunk_by_paragraph(
     paragraph_ids = []
     last_cut_type = None
     current_token_count = 0
+    if not max_tokens:
+        max_tokens = float("inf")
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
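The second file is where the None token limit is actually handled: replacing a falsy max_tokens with float("inf") lets the later token-budget comparison run without special-casing None. A self-contained sketch of that pattern, using a crude whitespace token count (chunk_sentences and its tokenization are illustrative, not the real chunk_by_paragraph implementation):

from typing import Iterator, List, Optional


def chunk_sentences(sentences: List[str], max_tokens: Optional[int] = None) -> Iterator[List[str]]:
    """Illustrative reduction of the pattern: treat a missing limit as unlimited."""
    # The guard added in this commit: None (or 0) becomes "no limit",
    # so the numeric comparison below never raises a TypeError.
    if not max_tokens:
        max_tokens = float("inf")

    current_chunk: List[str] = []
    current_token_count = 0
    for sentence in sentences:
        token_count = len(sentence.split())  # crude stand-in for a real tokenizer
        if current_chunk and current_token_count + token_count > max_tokens:
            yield current_chunk
            current_chunk, current_token_count = [], 0
        current_chunk.append(sentence)
        current_token_count += token_count
    if current_chunk:
        yield current_chunk


# With max_tokens=None everything lands in a single chunk.
print(list(chunk_sentences(["one two three", "four five"], max_tokens=None)))
# With a limit, a new chunk starts once the budget would be exceeded.
print(list(chunk_sentences(["one two three", "four five"], max_tokens=3)))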