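Look up the embedding model inside chunk_by_paragraph instead of passing it down as an embedding_model parameter; obtain it from the vector engine (get_vector_engine().embedding_engine.model) rather than from a direct embedding-engine getter.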
This commit is contained in:
parent
f4397bf940
commit
34a9267f41
10 changed files with 25 additions and 27 deletions
|
|
@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
|
||||||
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
|
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
|
||||||
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
|
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
|
||||||
Task(classify_documents),
|
Task(classify_documents),
|
||||||
Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192),
|
Task(extract_chunks_from_documents, max_tokens=8192),
|
||||||
Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
|
Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
|
||||||
Task(
|
Task(
|
||||||
summarize_text,
|
summarize_text,
|
||||||
|
|
|
||||||
|
|
@ -14,13 +14,12 @@ class TextChunker():
|
||||||
chunk_size = 0
|
chunk_size = 0
|
||||||
token_count = 0
|
token_count = 0
|
||||||
|
|
||||||
def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024):
|
def __init__(self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024):
|
||||||
self.document = document
|
self.document = document
|
||||||
self.max_chunk_size = chunk_size
|
self.max_chunk_size = chunk_size
|
||||||
self.get_text = get_text
|
self.get_text = get_text
|
||||||
self.max_tokens = max_tokens if max_tokens else float("inf")
|
self.max_tokens = max_tokens if max_tokens else float("inf")
|
||||||
self.embedding_model = embedding_model
|
|
||||||
|
|
||||||
def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
|
def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
|
||||||
word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
|
word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
|
||||||
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
|
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
|
||||||
|
|
@ -31,7 +30,6 @@ class TextChunker():
|
||||||
for content_text in self.get_text():
|
for content_text in self.get_text():
|
||||||
for chunk_data in chunk_by_paragraph(
|
for chunk_data in chunk_by_paragraph(
|
||||||
content_text,
|
content_text,
|
||||||
self.embedding_model,
|
|
||||||
self.max_tokens,
|
self.max_tokens,
|
||||||
self.max_chunk_size,
|
self.max_chunk_size,
|
||||||
batch_paragraphs = True,
|
batch_paragraphs = True,
|
||||||
|
|
|
||||||
|
|
@ -13,12 +13,12 @@ class AudioDocument(Document):
|
||||||
result = get_llm_client().create_transcript(self.raw_data_location)
|
result = get_llm_client().create_transcript(self.raw_data_location)
|
||||||
return(result.text)
|
return(result.text)
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
||||||
# Transcribe the audio file
|
# Transcribe the audio file
|
||||||
|
|
||||||
text = self.create_transcript()
|
text = self.create_transcript()
|
||||||
|
|
||||||
chunker_func = ChunkerConfig.get_chunker(chunker)
|
chunker_func = ChunkerConfig.get_chunker(chunker)
|
||||||
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
|
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
|
||||||
|
|
@ -14,5 +14,5 @@ class Document(DataPoint):
|
||||||
"type": "Document"
|
"type": "Document"
|
||||||
}
|
}
|
||||||
|
|
||||||
def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:
|
def read(self, chunk_size: int, max_tokens: Optional[int], chunker = str) -> str:
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -14,11 +14,11 @@ class ImageDocument(Document):
|
||||||
result = get_llm_client().transcribe_image(self.raw_data_location)
|
result = get_llm_client().transcribe_image(self.raw_data_location)
|
||||||
return(result.choices[0].message.content)
|
return(result.choices[0].message.content)
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
||||||
# Transcribe the image file
|
# Transcribe the image file
|
||||||
text = self.transcribe_image()
|
text = self.transcribe_image()
|
||||||
|
|
||||||
chunker_func = ChunkerConfig.get_chunker(chunker)
|
chunker_func = ChunkerConfig.get_chunker(chunker)
|
||||||
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
|
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ from .Document import Document
|
||||||
class PdfDocument(Document):
|
class PdfDocument(Document):
|
||||||
type: str = "pdf"
|
type: str = "pdf"
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
||||||
file = PdfReader(self.raw_data_location)
|
file = PdfReader(self.raw_data_location)
|
||||||
|
|
||||||
def get_text():
|
def get_text():
|
||||||
|
|
@ -18,7 +18,7 @@ class PdfDocument(Document):
|
||||||
yield page_text
|
yield page_text
|
||||||
|
|
||||||
chunker_func = ChunkerConfig.get_chunker(chunker)
|
chunker_func = ChunkerConfig.get_chunker(chunker)
|
||||||
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
|
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ from .Document import Document
|
||||||
class TextDocument(Document):
|
class TextDocument(Document):
|
||||||
type: str = "text"
|
type: str = "text"
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
|
||||||
def get_text():
|
def get_text():
|
||||||
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
|
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -20,6 +20,6 @@ class TextDocument(Document):
|
||||||
|
|
||||||
chunker_func = ChunkerConfig.get_chunker(chunker)
|
chunker_func = ChunkerConfig.get_chunker(chunker)
|
||||||
|
|
||||||
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
|
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from .Document import Document
|
||||||
class UnstructuredDocument(Document):
|
class UnstructuredDocument(Document):
|
||||||
type: str = "unstructured"
|
type: str = "unstructured"
|
||||||
|
|
||||||
def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str:
|
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
|
||||||
def get_text():
|
def get_text():
|
||||||
try:
|
try:
|
||||||
from unstructured.partition.auto import partition
|
from unstructured.partition.auto import partition
|
||||||
|
|
@ -29,6 +29,6 @@ class UnstructuredDocument(Document):
|
||||||
|
|
||||||
yield text
|
yield text
|
||||||
|
|
||||||
chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
|
chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
|
||||||
|
|
@ -3,12 +3,13 @@ from uuid import NAMESPACE_OID, uuid5
|
||||||
|
|
||||||
import tiktoken
|
import tiktoken
|
||||||
|
|
||||||
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
|
|
||||||
from .chunk_by_sentence import chunk_by_sentence
|
from .chunk_by_sentence import chunk_by_sentence
|
||||||
|
|
||||||
|
|
||||||
def chunk_by_paragraph(
|
def chunk_by_paragraph(
|
||||||
data: str,
|
data: str,
|
||||||
embedding_model: Optional[str],
|
|
||||||
max_tokens: Optional[Union[int, float]],
|
max_tokens: Optional[Union[int, float]],
|
||||||
paragraph_length: int = 1024,
|
paragraph_length: int = 1024,
|
||||||
batch_paragraphs: bool = True
|
batch_paragraphs: bool = True
|
||||||
|
|
@ -26,16 +27,16 @@ def chunk_by_paragraph(
|
||||||
if not max_tokens:
|
if not max_tokens:
|
||||||
max_tokens = float("inf")
|
max_tokens = float("inf")
|
||||||
|
|
||||||
|
vector_engine = get_vector_engine()
|
||||||
|
embedding_model = vector_engine.embedding_engine.model
|
||||||
|
|
||||||
for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
|
for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
|
||||||
# Check if this sentence would exceed length limit
|
# Check if this sentence would exceed length limit
|
||||||
if embedding_model:
|
|
||||||
if embedding_model.startswith("azure/"):
|
embedding_model = embedding_model.split("/")[-1]
|
||||||
embedding_model = embedding_model.split("/")[-1]
|
tokenizer = tiktoken.encoding_for_model(embedding_model)
|
||||||
tokenizer = tiktoken.encoding_for_model(embedding_model)
|
token_count = len(tokenizer.encode(sentence))
|
||||||
token_count = len(tokenizer.encode(sentence))
|
|
||||||
else:
|
|
||||||
token_count = 0
|
|
||||||
|
|
||||||
if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens):
|
if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens):
|
||||||
# Yield current chunk
|
# Yield current chunk
|
||||||
chunk_dict = {
|
chunk_dict = {
|
||||||
|
|
|
||||||
|
|
@ -7,9 +7,8 @@ async def extract_chunks_from_documents(
|
||||||
documents: list[Document],
|
documents: list[Document],
|
||||||
chunk_size: int = 1024,
|
chunk_size: int = 1024,
|
||||||
chunker='text_chunker',
|
chunker='text_chunker',
|
||||||
embedding_model: Optional[str] = None,
|
|
||||||
max_tokens: Optional[int] = None,
|
max_tokens: Optional[int] = None,
|
||||||
):
|
):
|
||||||
for document in documents:
|
for document in documents:
|
||||||
for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens):
|
for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens):
|
||||||
yield document_chunk
|
yield document_chunk
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue