Adjust AudioDocument and handle None token limit
This commit is contained in:
parent
fbf8fc93bf
commit
a774191ed3
2 changed files with 9 additions and 3 deletions
|
|
@@ -1,6 +1,10 @@
|
|||
from typing import Optional
|
||||
|
||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||
from .Document import Document
|
||||
|
||||
from .ChunkerMapping import ChunkerConfig
|
||||
from .Document import Document
|
||||
|
||||
|
||||
class AudioDocument(Document):
|
||||
type: str = "audio"
|
||||
|
|
@@ -9,12 +13,12 @@ class AudioDocument(Document):
|
|||
result = get_llm_client().create_transcript(self.raw_data_location)
|
||||
return(result.text)
|
||||
|
||||
def read(self, chunk_size: int, chunker: str):
|
||||
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
|
||||
# Transcribe the audio file
|
||||
|
||||
text = self.create_transcript()
|
||||
|
||||
chunker_func = ChunkerConfig.get_chunker(chunker)
|
||||
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
|
||||
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
|
||||
|
||||
yield from chunker.read()
|
||||
|
|
|
|||
|
|
@@ -23,6 +23,8 @@ def chunk_by_paragraph(
|
|||
paragraph_ids = []
|
||||
last_cut_type = None
|
||||
current_token_count = 0
|
||||
if not max_tokens:
|
||||
max_tokens = float("inf")
|
||||
|
||||
for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
|
||||
# Check if this sentence would exceed length limit
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue