Adjust AudioDocument and handle None token limit
This commit is contained in:
parent
fbf8fc93bf
commit
a774191ed3
2 changed files with 9 additions and 3 deletions
|
|
@@ -1,6 +1,10 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||||
from .Document import Document
|
|
||||||
from .ChunkerMapping import ChunkerConfig
|
from .ChunkerMapping import ChunkerConfig
|
||||||
|
from .Document import Document
|
||||||
|
|
||||||
|
|
||||||
class AudioDocument(Document):
|
class AudioDocument(Document):
|
||||||
type: str = "audio"
|
type: str = "audio"
|
||||||
|
|
@@ -9,12 +13,12 @@ class AudioDocument(Document):
|
||||||
result = get_llm_client().create_transcript(self.raw_data_location)
|
result = get_llm_client().create_transcript(self.raw_data_location)
|
||||||
return(result.text)
|
return(result.text)
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str):
|
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
|
||||||
# Transcribe the audio file
|
# Transcribe the audio file
|
||||||
|
|
||||||
text = self.create_transcript()
|
text = self.create_transcript()
|
||||||
|
|
||||||
chunker_func = ChunkerConfig.get_chunker(chunker)
|
chunker_func = ChunkerConfig.get_chunker(chunker)
|
||||||
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
|
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
|
||||||
|
|
@@ -23,6 +23,8 @@ def chunk_by_paragraph(
|
||||||
paragraph_ids = []
|
paragraph_ids = []
|
||||||
last_cut_type = None
|
last_cut_type = None
|
||||||
current_token_count = 0
|
current_token_count = 0
|
||||||
|
if not max_tokens:
|
||||||
|
max_tokens = float("inf")
|
||||||
|
|
||||||
for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
|
for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
|
||||||
# Check if this sentence would exceed length limit
|
# Check if this sentence would exceed length limit
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue