From a774191ed3153442bbdc29a79e90f45c51bc5cc5 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev
Date: Tue, 7 Jan 2025 13:38:23 +0100
Subject: [PATCH] Adjust AudioDocument and handle None token limit

---
 .../data/processing/document_types/AudioDocument.py | 10 +++++++---
 cognee/tasks/chunks/chunk_by_paragraph.py           |  2 ++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index 268338703..a59064674 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -1,6 +1,10 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
+
 
 class AudioDocument(Document):
     type: str = "audio"
@@ -9,12 +13,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model: Optional[str], max_tokens: Optional[int]):
         # Transcribe the audio file
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 546d4a1a7..2bbd9689f 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -23,6 +23,8 @@ def chunk_by_paragraph(
     paragraph_ids = []
     last_cut_type = None
     current_token_count = 0
+    if not max_tokens:
+        max_tokens = float("inf")
 
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
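
Reviewer note (illustration, not part of the patch): the new guard in
chunk_by_paragraph treats any falsy max_tokens (None, but also 0) as "no
limit", so comparisons against current_token_count always succeed. A minimal
standalone sketch of that behavior, where resolve_token_limit is a
hypothetical helper name, not a function in the cognee codebase:

    from typing import Optional

    def resolve_token_limit(max_tokens: Optional[int]) -> float:
        # Mirrors the patch's `if not max_tokens: max_tokens = float("inf")`:
        # a falsy limit becomes +inf, so any finite token count stays below it
        # and token-based chunk cutting is effectively disabled.
        if not max_tokens:
            return float("inf")
        return float(max_tokens)

    assert resolve_token_limit(None) == float("inf")
    assert resolve_token_limit(0) == float("inf")
    assert resolve_token_limit(512) == 512.0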