From 34a9267f414efc9553509bfdbf63bbee6aa5be69 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev
Date: Wed, 8 Jan 2025 13:23:17 +0100
Subject: [PATCH] Get embedding engine instead of passing it. Get it from
 vector engine instead of direct getter.

---
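Notes for reviewers: chunk_by_paragraph now resolves the embedding model
name itself from the vector engine, so callers no longer thread an
embedding_model argument through the pipeline, the document readers, and
the chunkers. A minimal sketch of the new lookup path, using only names
that appear in the hunks below; the get_encoding fallback is an extra
safeguard this patch does not add:

    import tiktoken
    from cognee.infrastructure.databases.vector import get_vector_engine

    vector_engine = get_vector_engine()
    # Strip a "provider/model" prefix such as "azure/" (a no-op for bare names).
    model_name = vector_engine.embedding_engine.model.split("/")[-1]
    try:
        tokenizer = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # encoding_for_model raises KeyError for model names tiktoken does
        # not recognize, e.g. non-OpenAI embedding models.
        tokenizer = tiktoken.get_encoding("cl100k_base")
    token_count = len(tokenizer.encode("How many tokens is this sentence?"))

Two behavioral points worth flagging: token counting previously ran only
when an embedding_model was passed (token_count stayed 0 otherwise), while
the new code always calls encoding_for_model and so will raise for model
names tiktoken does not know; and the tokenizer is constructed inside the
per-sentence loop, so hoisting it above the for loop would do that work
once per call instead of once per sentence.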
 cognee/api/v1/cognify/code_graph_pipeline.py  |  2 +-
 cognee/modules/chunking/TextChunker.py        |  6 ++----
 .../document_types/AudioDocument.py           |  4 ++--
 .../processing/document_types/Document.py     |  2 +-
 .../document_types/ImageDocument.py           |  4 ++--
 .../processing/document_types/PdfDocument.py  |  4 ++--
 .../processing/document_types/TextDocument.py |  4 ++--
 .../document_types/UnstructuredDocument.py    |  4 ++--
 cognee/tasks/chunks/chunk_by_paragraph.py     | 19 ++++++++++---------
 .../extract_chunks_from_documents.py          |  3 +--
 10 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 2648d0731..7ba461f88 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
         Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
         Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
         Task(classify_documents),
-        Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192),
+        Task(extract_chunks_from_documents, max_tokens=8192),
         Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
         Task(
             summarize_text,
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
index 8ef4bfda9..a9cb52bf0 100644
--- a/cognee/modules/chunking/TextChunker.py
+++ b/cognee/modules/chunking/TextChunker.py
@@ -14,13 +14,12 @@ class TextChunker():
     chunk_size = 0
     token_count = 0
 
-    def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024):
         self.document = document
         self.max_chunk_size = chunk_size
         self.get_text = get_text
         self.max_tokens = max_tokens if max_tokens else float("inf")
-        self.embedding_model = embedding_model
-
+
     def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
         word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
         token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
@@ -31,7 +30,6 @@ class TextChunker():
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
-                self.embedding_model,
                 self.max_tokens,
                 self.max_chunk_size,
                 batch_paragraphs = True,
diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index a59064674..c4e6ae87c 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -13,12 +13,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         # Transcribe the audio file
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 6712175fb..7c76d3f23 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -14,5 +14,5 @@ class Document(DataPoint):
         "type": "Document"
     }
 
-    def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:
+    def read(self, chunk_size: int, max_tokens: Optional[int], chunker = str) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index 1f4f281f8..ffe8ff3f9 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -14,11 +14,11 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return(result.choices[0].message.content)
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         # Transcribe the image file
         text = self.transcribe_image()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 27dadda33..463911d5b 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -9,7 +9,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -18,7 +18,7 @@ class PdfDocument(Document):
             for page in file.pages:
                 page_text = page.extract_text()
                 yield page_text
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 895a6f8b6..582f47737 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -7,7 +7,7 @@ from .Document import Document
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         def get_text():
             with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
                 while True:
@@ -20,6 +20,6 @@ class TextDocument(Document):
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index c94ca4a25..6c70744a0 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -10,7 +10,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str:
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -29,6 +29,6 @@ class UnstructuredDocument(Document):
 
             yield text
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index b3c191e29..8ab66bd7f 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -3,12 +3,13 @@
 from uuid import NAMESPACE_OID, uuid5
 
 import tiktoken
 
+from cognee.infrastructure.databases.vector import get_vector_engine
+
 from .chunk_by_sentence import chunk_by_sentence
 
 def chunk_by_paragraph(
     data: str,
-    embedding_model: Optional[str],
     max_tokens: Optional[Union[int, float]],
     paragraph_length: int = 1024,
     batch_paragraphs: bool = True
@@ -26,16 +27,16 @@ def chunk_by_paragraph(
     if not max_tokens:
         max_tokens = float("inf")
 
+    vector_engine = get_vector_engine()
+    embedding_model = vector_engine.embedding_engine.model
+
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
-        if embedding_model:
-            if embedding_model.startswith("azure/"):
-                embedding_model = embedding_model.split("/")[-1]
-            tokenizer = tiktoken.encoding_for_model(embedding_model)
-            token_count = len(tokenizer.encode(sentence))
-        else:
-            token_count = 0
-
+
+        embedding_model = embedding_model.split("/")[-1]
+        tokenizer = tiktoken.encoding_for_model(embedding_model)
+        token_count = len(tokenizer.encode(sentence))
+
         if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens):
             # Yield current chunk
             chunk_dict = {
diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py
index ddcdb8765..e647afbef 100644
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -7,9 +7,8 @@ async def extract_chunks_from_documents(
     documents: list[Document],
     chunk_size: int = 1024,
     chunker='text_chunker',
-    embedding_model: Optional[str] = None,
     max_tokens: Optional[int] = None,
 ):
     for document in documents:
-        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens):
+        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens):
             yield document_chunk