From 34a9267f414efc9553509bfdbf63bbee6aa5be69 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev
Date: Wed, 8 Jan 2025 13:23:17 +0100
Subject: [PATCH] Get embedding engine instead of passing it. Get it from
 vector engine instead of direct getter.

---
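Notes for reviewers: chunk_by_paragraph now resolves the embedding model
name itself from the vector engine, so callers no longer thread an
embedding_model argument through the pipeline, the document readers, and
the chunkers. A minimal sketch of the new lookup path, using only names
that appear in the hunks below; the get_encoding fallback is an extra
safeguard this patch does not add:

    import tiktoken
    from cognee.infrastructure.databases.vector import get_vector_engine

    vector_engine = get_vector_engine()
    # Strip a "provider/model" prefix such as "azure/" (a no-op for bare names).
    model_name = vector_engine.embedding_engine.model.split("/")[-1]
    try:
        tokenizer = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # encoding_for_model raises KeyError for model names tiktoken does
        # not recognize, e.g. non-OpenAI embedding models.
        tokenizer = tiktoken.get_encoding("cl100k_base")
    token_count = len(tokenizer.encode("How many tokens is this sentence?"))

Two behavioral points worth flagging: token counting previously ran only
when an embedding_model was passed (token_count stayed 0 otherwise), while
the new code always calls encoding_for_model and so will raise for model
names tiktoken does not know; and the tokenizer is constructed inside the
per-sentence loop, so hoisting it above the for loop would do that work
once per call instead of once per sentence.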
 cognee/api/v1/cognify/code_graph_pipeline.py  |  2 +-
 cognee/modules/chunking/TextChunker.py        |  6 ++----
 .../document_types/AudioDocument.py           |  4 ++--
 .../processing/document_types/Document.py     |  2 +-
 .../document_types/ImageDocument.py           |  4 ++--
 .../processing/document_types/PdfDocument.py  |  4 ++--
 .../processing/document_types/TextDocument.py |  4 ++--
 .../document_types/UnstructuredDocument.py    |  4 ++--
 cognee/tasks/chunks/chunk_by_paragraph.py     | 19 ++++++++++---------
 .../extract_chunks_from_documents.py          |  3 +--
 10 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 2648d0731..7ba461f88 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
         Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
         Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
         Task(classify_documents),
-        Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192),
+        Task(extract_chunks_from_documents, max_tokens=8192),
         Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
         Task(
             summarize_text,
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
index 8ef4bfda9..a9cb52bf0 100644
--- a/cognee/modules/chunking/TextChunker.py
+++ b/cognee/modules/chunking/TextChunker.py
@@ -14,13 +14,12 @@ class TextChunker():
     chunk_size = 0
     token_count = 0
 
-    def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024):
         self.document = document
         self.max_chunk_size = chunk_size
         self.get_text = get_text
         self.max_tokens = max_tokens if max_tokens else float("inf")
-        self.embedding_model = embedding_model
-
+
     def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
         word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
         token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
@@ -31,7 +30,6 @@ class TextChunker():
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
-                self.embedding_model,
                 self.max_tokens,
                 self.max_chunk_size,
                 batch_paragraphs = True,
diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index a59064674..c4e6ae87c 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -13,12 +13,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         # Transcribe the audio file
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 6712175fb..7c76d3f23 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -14,5 +14,5 @@ class Document(DataPoint):
         "type": "Document"
     }
 
-    def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:
+    def read(self, chunk_size: int, max_tokens: Optional[int], chunker = str) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index 1f4f281f8..ffe8ff3f9 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -14,11 +14,11 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return(result.choices[0].message.content)
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         # Transcribe the image file
         text = self.transcribe_image()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 27dadda33..463911d5b 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -9,7 +9,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -18,7 +18,7 @@ class PdfDocument(Document):
             for page in file.pages:
                 page_text = page.extract_text()
                 yield page_text
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 895a6f8b6..582f47737 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -7,7 +7,7 @@ from .Document import Document
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         def get_text():
             with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
                 while True:
@@ -20,6 +20,6 @@ class TextDocument(Document):
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index c94ca4a25..6c70744a0 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -10,7 +10,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str:
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -29,6 +29,6 @@ class UnstructuredDocument(Document):
 
             yield text
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index b3c191e29..8ab66bd7f 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -3,12 +3,13 @@
 from uuid import NAMESPACE_OID, uuid5
 
 import tiktoken
 
+from cognee.infrastructure.databases.vector import get_vector_engine
+
 from .chunk_by_sentence import chunk_by_sentence
 
 def chunk_by_paragraph(
     data: str,
-    embedding_model: Optional[str],
     max_tokens: Optional[Union[int, float]],
     paragraph_length: int = 1024,
     batch_paragraphs: bool = True
@@ -26,16 +27,16 @@ def chunk_by_paragraph(
     if not max_tokens:
         max_tokens = float("inf")
 
+    vector_engine = get_vector_engine()
+    embedding_model = vector_engine.embedding_engine.model
+
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
-        if embedding_model:
-            if embedding_model.startswith("azure/"):
-                embedding_model = embedding_model.split("/")[-1]
-            tokenizer = tiktoken.encoding_for_model(embedding_model)
-            token_count = len(tokenizer.encode(sentence))
-        else:
-            token_count = 0
-
+
+        embedding_model = embedding_model.split("/")[-1]
+        tokenizer = tiktoken.encoding_for_model(embedding_model)
+        token_count = len(tokenizer.encode(sentence))
+
         if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens):
             # Yield current chunk
             chunk_dict = {
diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py
index ddcdb8765..e647afbef 100644
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -7,9 +7,8 @@ async def extract_chunks_from_documents(
     documents: list[Document],
     chunk_size: int = 1024,
     chunker='text_chunker',
-    embedding_model: Optional[str] = None,
     max_tokens: Optional[int] = None,
 ):
     for document in documents:
-        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens):
+        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens):
             yield document_chunk