Adjust AudioDocument and handle None token limit

Rita Aleksziev 2025-01-07 13:38:23 +01:00
parent fbf8fc93bf
commit a774191ed3
2 changed files with 9 additions and 3 deletions


@@ -1,6 +1,10 @@
+from typing import Optional
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
 
 class AudioDocument(Document):
     type: str = "audio"
@@ -9,12 +13,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
         # Transcribe the audio file
         text = self.create_transcript()
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
         yield from chunker.read()
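Taken together, the changes in this file only widen the read() signature and forward the two new arguments to whichever chunker ChunkerConfig resolves. A minimal sketch of that flow, using a made-up DummyChunker and read_audio helper (these names, the 1024 chunk size, and the embedding-model string are illustrative and not part of the commit):

from typing import Callable, Iterator, List, Optional


class DummyChunker:
    """Illustrative stand-in for whatever ChunkerConfig.get_chunker() returns."""

    def __init__(self, document, chunk_size: int, get_text: Callable[[], List[str]],
                 embedding_model: Optional[str] = None, max_tokens: Optional[int] = None):
        self.document = document
        self.chunk_size = chunk_size
        self.get_text = get_text
        self.embedding_model = embedding_model
        # Mirror the new behaviour: a missing limit means "no limit".
        self.max_tokens = max_tokens if max_tokens is not None else float("inf")

    def read(self) -> Iterator[str]:
        # A real chunker would split by paragraphs and token budget;
        # this one just yields the transcript pieces unchanged.
        yield from self.get_text()


def read_audio(document, chunk_size: int, chunker_cls,
               embedding_model: Optional[str], max_tokens: Optional[int]) -> Iterator[str]:
    # Same shape as the updated AudioDocument.read(): the two new arguments
    # are simply passed through to the chunker constructor.
    text = "transcribed audio text"  # stand-in for self.create_transcript()
    chunker = chunker_cls(document, chunk_size=chunk_size, get_text=lambda: [text],
                          embedding_model=embedding_model, max_tokens=max_tokens)
    yield from chunker.read()


for chunk in read_audio(None, 1024, DummyChunker,
                        embedding_model="text-embedding-3-large", max_tokens=None):
    print(chunk)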


@@ -23,6 +23,8 @@ def chunk_by_paragraph(
     paragraph_ids = []
     last_cut_type = None
     current_token_count = 0
+    if not max_tokens:
+        max_tokens = float("inf")
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
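The second file is where the None token limit is actually handled: replacing a falsy max_tokens with float("inf") lets the later token-budget comparison run without special-casing None. A self-contained sketch of that pattern, using a crude whitespace token count (chunk_sentences and its tokenization are illustrative, not the real chunk_by_paragraph implementation):

from typing import Iterator, List, Optional


def chunk_sentences(sentences: List[str], max_tokens: Optional[int] = None) -> Iterator[List[str]]:
    """Illustrative reduction of the pattern: treat a missing limit as unlimited."""
    # The guard added in this commit: None (or 0) becomes "no limit",
    # so the numeric comparison below never raises a TypeError.
    if not max_tokens:
        max_tokens = float("inf")

    current_chunk: List[str] = []
    current_token_count = 0
    for sentence in sentences:
        token_count = len(sentence.split())  # crude stand-in for a real tokenizer
        if current_chunk and current_token_count + token_count > max_tokens:
            yield current_chunk
            current_chunk, current_token_count = [], 0
        current_chunk.append(sentence)
        current_token_count += token_count
    if current_chunk:
        yield current_chunk


# With max_tokens=None everything lands in a single chunk.
print(list(chunk_sentences(["one two three", "four five"], max_tokens=None)))
# With a limit, a new chunk starts once the budget would be exceeded.
print(list(chunk_sentences(["one two three", "four five"], max_tokens=3)))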