From a8a83fffff17c882c554e1a6ea95481de773c6b1 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 20 Dec 2024 10:53:57 +0100 Subject: [PATCH 01/24] Ingest non-code files --- cognee/api/v1/cognify/code_graph_pipeline.py | 43 ++++++++++++++++--- cognee/tasks/repo_processor/__init__.py | 1 + .../repo_processor/get_non_code_files.py | 36 ++++++++++++++++ examples/python/code_graph_example.py | 11 ++--- 4 files changed, 80 insertions(+), 11 deletions(-) create mode 100644 cognee/tasks/repo_processor/get_non_code_files.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index c35f9719f..8e92d08e0 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -2,29 +2,37 @@ import asyncio import logging from pathlib import Path +from cognee.base_config import get_base_config +from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.users.methods import get_default_user +from cognee.shared.data_models import KnowledgeGraph, MonitoringTool +from cognee.tasks.documents import (classify_documents, + extract_chunks_from_documents) +from cognee.tasks.graph import extract_graph_from_data +from cognee.tasks.ingestion import ingest_data_with_metadata from cognee.tasks.repo_processor import (enrich_dependency_graph, expand_dependency_graph, + get_data_list_for_user, + get_non_code_files, get_repo_file_dependencies) from cognee.tasks.storage import add_data_points -from cognee.base_config import get_base_config -from cognee.shared.data_models import MonitoringTool - monitoring = get_base_config().monitoring_tool if monitoring == MonitoringTool.LANGFUSE: from langfuse.decorators import observe -from cognee.tasks.summarization import summarize_code +from cognee.tasks.summarization import summarize_code, summarize_text logger = logging.getLogger("code_graph_pipeline") update_status_lock = asyncio.Lock() @observe -async def run_code_graph_pipeline(repo_path): +async def run_code_graph_pipeline(repo_path, include_docs=True): import os import pathlib + import cognee from cognee.infrastructure.databases.relational import create_db_and_tables @@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() + cognee_config = get_cognify_config() + user = await get_default_user() + tasks = [ Task(get_repo_file_dependencies), Task(enrich_dependency_graph, task_config={"batch_size": 50}), @@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path): Task(add_data_points, task_config={"batch_size": 50}), ] - return run_tasks(tasks, repo_path, "cognify_code_pipeline") + if include_docs: + non_code_tasks = [ + Task(get_non_code_files, task_config={"batch_size": 50}), + Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), + Task(get_data_list_for_user, dataset_name="repo_docs", user=user), + Task(classify_documents), + Task(extract_chunks_from_documents), + Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), + Task( + summarize_text, + summarization_model=cognee_config.summarization_model, + task_config={"batch_size": 50} + ), + ] + + if include_docs: + async for result in run_tasks(non_code_tasks, repo_path): + yield result + + async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"): + yield result \ No newline at end of file diff --git 
a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 05e111b29..fa754028e 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -4,4 +4,5 @@ logger = logging.getLogger("task:repo_processor") from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph +from .get_non_code_files import get_data_list_for_user, get_non_py_files from .get_repo_file_dependencies import get_repo_file_dependencies diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py new file mode 100644 index 000000000..5a8a34f64 --- /dev/null +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -0,0 +1,36 @@ +import os + +import aiofiles + +import cognee.modules.ingestion as ingestion +from cognee.infrastructure.engine import DataPoint +from cognee.modules.data.methods import get_datasets +from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.modules.data.methods.get_datasets_by_name import \ + get_datasets_by_name +from cognee.modules.data.models import Data +from cognee.modules.data.operations.write_metadata import write_metadata +from cognee.modules.ingestion.data_types import BinaryData +from cognee.modules.users.methods import get_default_user +from cognee.shared.CodeGraphEntities import Repository + + +async def get_non_py_files(repo_path): + """Get files that are not .py files and their contents""" + if not os.path.exists(repo_path): + return {} + + non_py_files_paths = [ + os.path.join(root, file) + for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py") + ] + return non_py_files_paths + + +async def get_data_list_for_user(_, dataset_name, user): + datasets = await get_datasets_by_name(dataset_name, user.id) + data_documents: list[Data] = [] + for dataset in datasets: + data_docs: list[Data] = await get_dataset_data(dataset_id=dataset.id) + data_documents.extend(data_docs) + return data_documents \ No newline at end of file diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 9189de46c..c0b91972b 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,15 +1,16 @@ import argparse import asyncio + from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline -async def main(repo_path): - async for result in await run_code_graph_pipeline(repo_path): +async def main(repo_path, include_docs): + async for result in run_code_graph_pipeline(repo_path, include_docs): print(result) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository") + parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") + parser.add_argument("--include_docs", type=bool, default=True, help="Whether or not to process non-code files") args = parser.parse_args() - asyncio.run(main(args.repo_path)) - + asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file From 399faf9ca0e445e477e7c8e2201ec3478d62e928 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 20 Dec 2024 13:37:24 +0100 Subject: [PATCH 02/24] Fixing review findings --- cognee/tasks/repo_processor/get_non_code_files.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_non_code_files.py 
b/cognee/tasks/repo_processor/get_non_code_files.py index 5a8a34f64..671b998d9 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -20,14 +20,26 @@ async def get_non_py_files(repo_path): if not os.path.exists(repo_path): return {} + IGNORED_PATTERNS = { + '.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd', + 'node_modules', '*.egg-info' + } + + def should_process(path): + return not any(pattern in path for pattern in IGNORED_PATTERNS) + non_py_files_paths = [ os.path.join(root, file) - for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py") + for root, _, files in os.walk(repo_path) for file in files + if not file.endswith(".py") and should_process(os.path.join(root, file)) ] return non_py_files_paths async def get_data_list_for_user(_, dataset_name, user): + # Note: This method is meant to be used as a Task in a pipeline. + # By the nature of pipelines, the output of the previous Task will be passed as the first argument here, + # but it is not needed here, hence the "_" input. datasets = await get_datasets_by_name(dataset_name, user.id) data_documents: list[Data] = [] for dataset in datasets: From 4cee9a16ce16048aedf4678201ed16b5ce22273b Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:22:45 +0100 Subject: [PATCH 03/24] fix: add allowed extensions --- .../repo_processor/get_non_code_files.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py index 671b998d9..f060782b6 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -25,8 +25,27 @@ async def get_non_py_files(repo_path): 'node_modules', '*.egg-info' } + ALLOWED_EXTENSIONS = { + '.txt', '.md', '.csv', '.json', '.xml', '.yaml', '.yml', '.html', + '.css', '.js', '.ts', '.jsx', '.tsx', '.sql', '.log', '.ini', + '.toml', '.properties', '.sh', '.bash', '.dockerfile', '.gitignore', + '.gitattributes', '.makefile', '.pyproject', '.requirements', + '.env', '.pdf', '.doc', '.docx', '.dot', '.dotx', '.rtf', + '.wps', '.wpd', '.odt', '.ott', '.ottx', '.txt', '.wp', + '.sdw', '.sdx', '.docm', '.dotm', + # Additional extensions for other programming languages + '.java', '.c', '.cpp', '.h', '.cs', '.go', '.php', '.rb', + '.swift', '.pl', '.lua', '.rs', '.scala', '.kt', '.sh', + '.sql', '.v', '.asm', '.pas', '.d', '.ml', '.clj', '.cljs', + '.erl', '.ex', '.exs', '.f', '.fs', '.r', '.pyi', + '.pdb', '.ipynb', '.rmd', '.cabal', '.hs', '.nim', + '.vhdl', '.verilog', '.svelte', '.html', '.css', '.scss', + '.less', '.json5', '.yaml', '.yml' + } + def should_process(path): - return not any(pattern in path for pattern in IGNORED_PATTERNS) + _, ext = os.path.splitext(path) + return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS) non_py_files_paths = [ os.path.join(root, file) From dbc33a6478944991f93223a3e443c3acf12460a7 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:23:55 +0100 Subject: [PATCH 04/24] fix: adhere UnstructuredDocument.read() to Document --- .../data/processing/document_types/UnstructuredDocument.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py 
b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index 62632cd08..8da065ff5 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -8,7 +8,7 @@ from cognee.modules.data.exceptions import UnstructuredLibraryImportError class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker = str) -> str: def get_text(): try: from unstructured.partition.auto import partition From 5e79dc53c55405217925c5a511a64efccb67578c Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:25:04 +0100 Subject: [PATCH 05/24] feat: time code graph run and add mock support --- examples/python/code_graph_example.py | 62 ++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 44ab33aad..9cd9f99c4 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -8,9 +8,61 @@ async def main(repo_path, include_docs): async for result in run_code_graph_pipeline(repo_path, include_docs): print(result) -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") - parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files") - args = parser.parse_args() - asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file + parser.add_argument( + "--repo_path", + type=str, + required=True, + help="Path to the repository" + ) + parser.add_argument( + "--include_docs", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to process non-code files" + ) + parser.add_argument( + "--mock_embedding", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to mock embedding and code summary" + ) + parser.add_argument( + "--mock_code_summary", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to mock code summary" + ) + parser.add_argument( + "--time", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to time the pipeline run" + ) + return parser.parse_args() + +if __name__ == "__main__": + import os + + args = parse_args() + + if args.mock_embedding: + os.environ["MOCK_EMBEDDING"] = "true" + print("Mocking embedding.") + + if args.mock_code_summary: + os.environ["MOCK_CODE_SUMMARY"] = "true" + print("Mocking code summary.") + + if args.time: + import time + start_time = time.time() + asyncio.run(main(args.repo_path, args.include_docs)) + end_time = time.time() + print("\n" + "="*50) + print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds") + print("="*50 + "\n") + else: + asyncio.run(main(args.repo_path, args.include_docs)) + \ No newline at end of file From 4802567871d34b3926f2e3dfd33ea9655dedcb44 Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:46:46 +0100 Subject: [PATCH 06/24] Overcome ContextWindowExceededError by checking token count while chunking (#413) --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/chunking/TextChunker.py | 23 +++++++++++--- .../processing/document_types/Document.py | 3 +- 
.../document_types/ImageDocument.py | 10 ++++-- .../processing/document_types/PdfDocument.py | 10 ++++-- .../processing/document_types/TextDocument.py | 9 ++++-- .../document_types/UnstructuredDocument.py | 8 +++-- cognee/tasks/chunks/chunk_by_paragraph.py | 31 ++++++++++++++++--- .../extract_chunks_from_documents.py | 12 +++++-- 9 files changed, 84 insertions(+), 24 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 3d31b4000..2648d0731 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents), + Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192), Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), Task( summarize_text, diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 64c7aae5c..8ef4bfda9 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -1,7 +1,10 @@ -from uuid import uuid5, NAMESPACE_OID +from typing import Optional +from uuid import NAMESPACE_OID, uuid5 + +from cognee.tasks.chunks import chunk_by_paragraph from .models.DocumentChunk import DocumentChunk -from cognee.tasks.chunks import chunk_by_paragraph + class TextChunker(): document = None @@ -9,23 +12,34 @@ class TextChunker(): chunk_index = 0 chunk_size = 0 + token_count = 0 - def __init__(self, document, get_text: callable, chunk_size: int = 1024): + def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text + self.max_tokens = max_tokens if max_tokens else float("inf") + self.embedding_model = embedding_model + + def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): + word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size + token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens + return word_count_fits and token_count_fits def read(self): paragraph_chunks = [] for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, + self.embedding_model, + self.max_tokens, self.max_chunk_size, batch_paragraphs = True, ): - if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size: + if self.check_word_count_and_token_count(self.chunk_size, self.token_count, chunk_data): paragraph_chunks.append(chunk_data) self.chunk_size += chunk_data["word_count"] + self.token_count += chunk_data["token_count"] else: if len(paragraph_chunks) == 0: yield DocumentChunk( @@ -63,6 +77,7 @@ class TextChunker(): print(e) paragraph_chunks = [chunk_data] self.chunk_size = chunk_data["word_count"] + self.token_count = chunk_data["token_count"] self.chunk_index += 1 diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 8d6a3dafb..6712175fb 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,3 +1,4 @@ +from typing 
import Optional from uuid import UUID from cognee.infrastructure.engine import DataPoint @@ -13,5 +14,5 @@ class Document(DataPoint): "type": "Document" } - def read(self, chunk_size: int, chunker = str) -> str: + def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 352486bd8..1f4f281f8 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -1,6 +1,10 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class ImageDocument(Document): type: str = "image" @@ -10,11 +14,11 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return(result.choices[0].message.content) - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): # Transcribe the image file text = self.transcribe_image() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 361214718..27dadda33 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,11 +1,15 @@ +from typing import Optional + from pypdf import PdfReader -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): file = PdfReader(self.raw_data_location) def get_text(): @@ -14,7 +18,7 @@ class PdfDocument(Document): yield page_text chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 3952d9845..895a6f8b6 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -1,10 +1,13 @@ -from .Document import Document +from typing import Optional + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): def get_text(): with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file: while True: @@ -17,6 +20,6 @@ class TextDocument(Document): chunker_func = 
ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index 8da065ff5..c94ca4a25 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -1,14 +1,16 @@ from io import StringIO +from typing import Optional from cognee.modules.chunking.TextChunker import TextChunker -from .Document import Document from cognee.modules.data.exceptions import UnstructuredLibraryImportError +from .Document import Document + class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker = str) -> str: + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str: def get_text(): try: from unstructured.partition.auto import partition @@ -27,6 +29,6 @@ class UnstructuredDocument(Document): yield text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 00bb5670c..546d4a1a7 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -1,8 +1,18 @@ -from uuid import uuid5, NAMESPACE_OID -from typing import Dict, Any, Iterator +from typing import Any, Dict, Iterator, Optional, Union +from uuid import NAMESPACE_OID, uuid5 + +import tiktoken + from .chunk_by_sentence import chunk_by_sentence -def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]: + +def chunk_by_paragraph( + data: str, + embedding_model: Optional[str], + max_tokens: Optional[Union[int, float]], + paragraph_length: int = 1024, + batch_paragraphs: bool = True + ) -> Iterator[Dict[str, Any]]: """ Chunks text by paragraph while preserving exact text reconstruction capability. When chunks are joined with empty string "", they reproduce the original text exactly. 
@@ -12,14 +22,22 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_index = 0 paragraph_ids = [] last_cut_type = None + current_token_count = 0 for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - if current_word_count > 0 and current_word_count + word_count > paragraph_length: + if embedding_model: + tokenizer = tiktoken.encoding_for_model(embedding_model) + token_count = len(tokenizer.encode(sentence)) + else: + token_count = 0 + + if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens): # Yield current chunk chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, @@ -32,11 +50,13 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 paragraph_ids.append(paragraph_id) current_chunk += sentence current_word_count += word_count + current_token_count += token_count # Handle end of paragraph if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: @@ -44,6 +64,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "paragraph_ids": paragraph_ids, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "chunk_index": chunk_index, @@ -53,6 +74,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 last_cut_type = end_type @@ -62,6 +84,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index 423b87b69..ddcdb8765 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -1,7 +1,15 @@ +from typing import Optional + from cognee.modules.data.processing.document_types.Document import Document -async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker = 'text_chunker'): +async def extract_chunks_from_documents( + documents: list[Document], + chunk_size: int = 1024, + chunker='text_chunker', + embedding_model: Optional[str] = None, + max_tokens: Optional[int] = None, + ): for document in documents: - for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker): + for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens): yield document_chunk From a774191ed3153442bbdc29a79e90f45c51bc5cc5 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 7 Jan 2025 13:38:23 +0100 Subject: [PATCH 07/24] Adjust AudioDocument and handle None token limit --- .../data/processing/document_types/AudioDocument.py | 10 +++++++--- 
cognee/tasks/chunks/chunk_by_paragraph.py | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index 268338703..a59064674 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -1,6 +1,10 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class AudioDocument(Document): type: str = "audio" @@ -9,12 +13,12 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return(result.text) - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): # Transcribe the audio file text = self.create_transcript() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 546d4a1a7..2bbd9689f 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -23,6 +23,8 @@ def chunk_by_paragraph( paragraph_ids = [] last_cut_type = None current_token_count = 0 + if not max_tokens: + max_tokens = float("inf") for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit From fb13a1b61a42c6b02ad85e70644c73aef722c1d7 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 7 Jan 2025 15:00:58 +0100 Subject: [PATCH 08/24] Handle azure models as well --- cognee/tasks/chunks/chunk_by_paragraph.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 2bbd9689f..b3c191e29 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -29,6 +29,8 @@ def chunk_by_paragraph( for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit if embedding_model: + if embedding_model.startswith("azure/"): + embedding_model = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(embedding_model) token_count = len(tokenizer.encode(sentence)) else: From 8ffef5034ae560c7514d21ae2b58d1f30013354d Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 12:25:31 +0100 Subject: [PATCH 09/24] Add clean logging to code graph example --- examples/python/code_graph_example.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 9cd9f99c4..afc83beb0 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,7 +1,9 @@ import argparse import asyncio +import logging from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline +from cognee.shared.utils import setup_logging async def main(repo_path, include_docs): @@ -43,6 +45,8 @@ def 
parse_args(): return parser.parse_args() if __name__ == "__main__": + setup_logging(logging.ERROR) + import os args = parse_args() From f4397bf940e3a54a745ac1be19cadb9e33a28ae4 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 12:33:14 +0100 Subject: [PATCH 10/24] Remove setting envvars from arg --- examples/python/code_graph_example.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index afc83beb0..16eafb024 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -24,18 +24,6 @@ def parse_args(): default=True, help="Whether or not to process non-code files" ) - parser.add_argument( - "--mock_embedding", - type=lambda x: x.lower() in ("true", "1"), - default=True, - help="Whether or not to mock embedding and code summary" - ) - parser.add_argument( - "--mock_code_summary", - type=lambda x: x.lower() in ("true", "1"), - default=True, - help="Whether or not to mock code summary" - ) parser.add_argument( "--time", type=lambda x: x.lower() in ("true", "1"), @@ -47,18 +35,8 @@ def parse_args(): if __name__ == "__main__": setup_logging(logging.ERROR) - import os - args = parse_args() - if args.mock_embedding: - os.environ["MOCK_EMBEDDING"] = "true" - print("Mocking embedding.") - - if args.mock_code_summary: - os.environ["MOCK_CODE_SUMMARY"] = "true" - print("Mocking code summary.") - if args.time: import time start_time = time.time() From 34a9267f414efc9553509bfdbf63bbee6aa5be69 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 13:23:17 +0100 Subject: [PATCH 11/24] Get embedding engine instead of passing it. Get it from vector engine instead of direct getter. --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/chunking/TextChunker.py | 6 ++---- .../document_types/AudioDocument.py | 4 ++-- .../processing/document_types/Document.py | 2 +- .../document_types/ImageDocument.py | 4 ++-- .../processing/document_types/PdfDocument.py | 4 ++-- .../processing/document_types/TextDocument.py | 4 ++-- .../document_types/UnstructuredDocument.py | 4 ++-- cognee/tasks/chunks/chunk_by_paragraph.py | 19 ++++++++++--------- .../extract_chunks_from_documents.py | 3 +-- 10 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 2648d0731..7ba461f88 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192), + Task(extract_chunks_from_documents, max_tokens=8192), Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), Task( summarize_text, diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 8ef4bfda9..a9cb52bf0 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -14,13 +14,12 @@ class TextChunker(): chunk_size = 0 token_count = 0 - def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024): + def __init__(self, document, 
get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text self.max_tokens = max_tokens if max_tokens else float("inf") - self.embedding_model = embedding_model - + def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens @@ -31,7 +30,6 @@ class TextChunker(): for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, - self.embedding_model, self.max_tokens, self.max_chunk_size, batch_paragraphs = True, diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index a59064674..c4e6ae87c 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -13,12 +13,12 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return(result.text) - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): # Transcribe the audio file text = self.create_transcript() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 6712175fb..7c76d3f23 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -14,5 +14,5 @@ class Document(DataPoint): "type": "Document" } - def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str: + def read(self, chunk_size: int, max_tokens: Optional[int], chunker = str) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 1f4f281f8..ffe8ff3f9 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -14,11 +14,11 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return(result.choices[0].message.content) - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): # Transcribe the image file text = self.transcribe_image() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 27dadda33..463911d5b 100644 --- 
a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -9,7 +9,7 @@ from .Document import Document class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): file = PdfReader(self.raw_data_location) def get_text(): @@ -18,7 +18,7 @@ class PdfDocument(Document): yield page_text chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 895a6f8b6..582f47737 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -7,7 +7,7 @@ from .Document import Document class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): def get_text(): with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file: while True: @@ -20,6 +20,6 @@ class TextDocument(Document): chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index c94ca4a25..6c70744a0 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -10,7 +10,7 @@ from .Document import Document class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str: + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str: def get_text(): try: from unstructured.partition.auto import partition @@ -29,6 +29,6 @@ class UnstructuredDocument(Document): yield text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index b3c191e29..8ab66bd7f 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -3,12 +3,13 @@ from uuid import NAMESPACE_OID, uuid5 import tiktoken +from cognee.infrastructure.databases.vector import get_vector_engine + from .chunk_by_sentence import chunk_by_sentence def chunk_by_paragraph( data: str, - embedding_model: Optional[str], max_tokens: Optional[Union[int, float]], paragraph_length: int = 1024, batch_paragraphs: bool = True @@ -26,16 +27,16 @@ def 
chunk_by_paragraph( if not max_tokens: max_tokens = float("inf") + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - if embedding_model: - if embedding_model.startswith("azure/"): - embedding_model = embedding_model.split("/")[-1] - tokenizer = tiktoken.encoding_for_model(embedding_model) - token_count = len(tokenizer.encode(sentence)) - else: - token_count = 0 - + + embedding_model = embedding_model.split("/")[-1] + tokenizer = tiktoken.encoding_for_model(embedding_model) + token_count = len(tokenizer.encode(sentence)) + if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens): # Yield current chunk chunk_dict = { diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index ddcdb8765..e647afbef 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -7,9 +7,8 @@ async def extract_chunks_from_documents( documents: list[Document], chunk_size: int = 1024, chunker='text_chunker', - embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, ): for document in documents: - for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens): + for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens): yield document_chunk From 97814e334f282b344cb0357df387b70cbf801397 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 13:45:04 +0100 Subject: [PATCH 12/24] Get embedding engine instead of passing it in code chunking. 
--- cognee/api/v1/cognify/code_graph_pipeline.py | 6 +----- cognee/tasks/chunks/chunk_by_paragraph.py | 4 ++-- cognee/tasks/repo_processor/get_source_code_chunks.py | 9 ++++++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 7ba461f88..6e06edfa3 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -3,8 +3,6 @@ import logging from pathlib import Path from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector.embeddings import \ - get_embedding_engine from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task @@ -51,8 +49,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() - embedding_engine = get_embedding_engine() - cognee_config = get_cognify_config() user = await get_default_user() @@ -60,7 +56,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(get_repo_file_dependencies), Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), - Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}), + Task(get_source_code_chunks, task_config={"batch_size": 50}), Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 8ab66bd7f..44355a1ad 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -29,11 +29,11 @@ def chunk_by_paragraph( vector_engine = get_vector_engine() embedding_model = vector_engine.embedding_engine.model - + embedding_model = embedding_model.split("/")[-1] + for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - embedding_model = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(embedding_model) token_count = len(tokenizer.encode(sentence)) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 4d0ce3200..0bf7ebe32 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -5,6 +5,7 @@ from uuid import NAMESPACE_OID, uuid5 import parso import tiktoken +from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk @@ -115,13 +116,15 @@ def get_source_code_chunks_from_code_part( max_tokens: int = 8192, overlap: float = 0.25, granularity: float = 0.1, - model_name: str = "text-embedding-3-large" ) -> Generator[SourceCodeChunk, None, None]: """Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" if not code_file_part.source_code: logger.error(f"No source code in CodeFile {code_file_part.id}") return + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + model_name = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(model_name) max_subchunk_tokens = max(1, int(granularity * 
max_tokens)) subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens) @@ -141,7 +144,7 @@ def get_source_code_chunks_from_code_part( previous_chunk = current_chunk -async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ +async def get_source_code_chunks(data_points: list[DataPoint]) -> \ AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, create SourceCodeChink datapoints.""" # TODO: Add support for other embedding models, with max_token mapping @@ -156,7 +159,7 @@ async def get_source_code_chunks(data_points: list[DataPoint], embedding_model=" for code_part in data_point.contains: try: yield code_part - for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): + for source_code_chunk in get_source_code_chunks_from_code_part(code_part): yield source_code_chunk except Exception as e: logger.error(f"Error processing code part: {e}") From abb3ea6d219f8221500fb7a7e7f6cc404cf75b08 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 11:31:16 +0100 Subject: [PATCH 13/24] Adjust integration tests --- .../data/processing/document_types/AudioDocument.py | 2 +- cognee/modules/data/processing/document_types/Document.py | 2 +- .../data/processing/document_types/ImageDocument.py | 2 +- .../modules/data/processing/document_types/PdfDocument.py | 2 +- .../data/processing/document_types/TextDocument.py | 2 +- .../processing/document_types/UnstructuredDocument.py | 2 +- .../integration/documents/UnstructuredDocument_test.py | 8 ++++---- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index faace056b..b7d2476b4 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -13,7 +13,7 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return result.text - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): # Transcribe the audio file text = self.create_transcript() diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 9a29e7797..7ecdf289e 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -11,5 +11,5 @@ class Document(DataPoint): mime_type: str _metadata: dict = {"index_fields": ["name"], "type": "Document"} - def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str: + def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index f0c7a6d61..c055b8253 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -13,7 +13,7 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return result.choices[0].message.content - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: 
Optional[int] = None): # Transcribe the image file text = self.transcribe_image() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 56969c7f8..768f91264 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -9,7 +9,7 @@ from .Document import Document class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): file = PdfReader(self.raw_data_location) def get_text(): diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 11dc798aa..b62ccd56e 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -7,7 +7,7 @@ from .Document import Document class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): def get_text(): with open(self.raw_data_location, mode="r", encoding="utf-8") as file: while True: diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index d6b64498c..1c291d0dc 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -10,7 +10,7 @@ from .Document import Document class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str: + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str: def get_text(): try: from unstructured.partition.auto import partition diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 03b8deb49..e0278de81 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -68,7 +68,7 @@ def test_UnstructuredDocument(): ) # Test PPTX - for paragraph_data in pptx_document.read(chunk_size=1024): + for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" assert ( @@ -76,7 +76,7 @@ def test_UnstructuredDocument(): ), f" sentence_cut != {paragraph_data.cut_type = }" # Test DOCX - for paragraph_data in docx_document.read(chunk_size=1024): + for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" assert ( @@ -84,7 +84,7 @@ def test_UnstructuredDocument(): ), f" sentence_end != {paragraph_data.cut_type = }" # TEST CSV - for paragraph_data in csv_document.read(chunk_size=1024): + for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" assert ( "A A A A A A A A 
A,A A A A A A,A A" == paragraph_data.text @@ -94,7 +94,7 @@ def test_UnstructuredDocument(): ), f" sentence_cut != {paragraph_data.cut_type = }" # Test XLSX - for paragraph_data in xlsx_document.read(chunk_size=1024): + for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" assert ( From cdaae161a8e006415988a45ca529d5bc71f9d632 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 12:08:42 +0100 Subject: [PATCH 14/24] Handle circular import --- cognee/tasks/repo_processor/__init__.py | 3 --- cognee/tasks/repo_processor/expand_dependency_graph.py | 5 ++++- cognee/tasks/repo_processor/extract_code_parts.py | 4 +++- cognee/tasks/repo_processor/get_local_dependencies.py | 4 +++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 6dc032547..8f0df23d8 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -2,6 +2,3 @@ from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph from .get_non_code_files import get_data_list_for_user, get_non_py_files from .get_repo_file_dependencies import get_repo_file_dependencies -import logging - -logger = logging.getLogger("task:repo_processor") diff --git a/cognee/tasks/repo_processor/expand_dependency_graph.py b/cognee/tasks/repo_processor/expand_dependency_graph.py index de26fe8d4..d3f5d1b07 100644 --- a/cognee/tasks/repo_processor/expand_dependency_graph.py +++ b/cognee/tasks/repo_processor/expand_dependency_graph.py @@ -5,7 +5,10 @@ from uuid import NAMESPACE_OID, uuid5 from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts -from cognee.tasks.repo_processor import logger + +import logging + +logger = logging.getLogger("task:repo_processor") def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None: diff --git a/cognee/tasks/repo_processor/extract_code_parts.py b/cognee/tasks/repo_processor/extract_code_parts.py index 76cfef538..c181a87d9 100644 --- a/cognee/tasks/repo_processor/extract_code_parts.py +++ b/cognee/tasks/repo_processor/extract_code_parts.py @@ -1,7 +1,9 @@ from typing import Dict, List import parso -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger("task:repo_processor") def _extract_parts_from_module(module, parts_dict: Dict[str, List[str]]) -> Dict[str, List[str]]: diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py index b443829c9..92b50cd0b 100644 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -10,7 +10,9 @@ import jedi import parso from parso.tree import BaseNode -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger("task:repo_processor") @contextmanager From 626bc76f5ccdb830e741cf74464e1e3a967dec75 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 12:53:26 +0100 Subject: [PATCH 15/24] Set max_tokens in config --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/cognify/config.py | 4 +++- 2 files changed, 
4 insertions(+), 2 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 53c41d43b..2d077f39b 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents, max_tokens=8192), + Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens), Task( extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50} ), diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py index d40410bfc..dd94d8b41 100644 --- a/cognee/modules/cognify/config.py +++ b/cognee/modules/cognify/config.py @@ -1,12 +1,14 @@ from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent +from typing import Optional +import os class CognifyConfig(BaseSettings): classification_model: object = DefaultContentPrediction summarization_model: object = SummarizedContent - + max_tokens: Optional[int] = os.getenv("MAX_TOKENS") model_config = SettingsConfigDict(env_file=".env", extra="allow") def to_dict(self) -> dict: From d7b2186300db585ee5dd1affb20e85e3017a1606 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 14:27:37 +0100 Subject: [PATCH 16/24] Adjust SWE-bench script to code graph pipeline call --- evals/eval_swe_bench.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 789c95ab4..509530685 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -34,9 +34,8 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - pipeline = await run_code_graph_pipeline(repo_path) - async for result in pipeline: + async for result in run_code_graph_pipeline(repo_path, include_docs=True): print(result) print("Here we have the repo under the repo_path") @@ -47,7 +46,16 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp instructions = read_query_prompt("patch_gen_kg_instructions.txt") retrieved_edges = await brute_force_triplet_search( - problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"] + problem_statement, + top_k=3, + collections=[ + "code_summary_text", + "data_point_name", + "document_chunk_text", + "entity_name", + "entity_type_name", + "sourcecodechunk_source_code", + ], ) retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) From 18bb282fbc7fb3c7e770641bd0b64fa38af7dd92 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 14:27:37 +0100 Subject: [PATCH 17/24] Adjust SWE-bench script to code graph pipeline call --- evals/eval_swe_bench.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 789c95ab4..20e005751 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -34,9 +34,8 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): repo_path = 
download_github_repo(instance, "../RAW_GIT_REPOS") - pipeline = await run_code_graph_pipeline(repo_path) - async for result in pipeline: + async for result in run_code_graph_pipeline(repo_path, include_docs=True): print(result) print("Here we have the repo under the repo_path") @@ -47,7 +46,9 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp instructions = read_query_prompt("patch_gen_kg_instructions.txt") retrieved_edges = await brute_force_triplet_search( - problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"] + problem_statement, + top_k=3, + collections=["code_summary_text"], ) retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) From 56cc2233027e80dda25fe0fb7ee14347915800d6 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:46:41 +0100 Subject: [PATCH 18/24] feat: adds pydantic types to graph layer models --- cognee/modules/chunking/models/DocumentChunk.py | 1 + cognee/modules/engine/models/Entity.py | 1 + cognee/modules/engine/models/EntityType.py | 1 + cognee/shared/CodeGraphEntities.py | 4 ++++ cognee/tasks/summarization/models.py | 1 + 5 files changed, 8 insertions(+) diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 4920e9b06..a232d50a1 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -12,6 +12,7 @@ class DocumentChunk(DataPoint): chunk_index: int cut_type: str is_part_of: Document + pydantic_type: str = "DocumentChunk" contains: List[Entity] = None _metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 63a153bf2..0e57d5dba 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -7,5 +7,6 @@ class Entity(DataPoint): name: str is_a: EntityType description: str + pydantic_type: str = "Entity" _metadata: dict = {"index_fields": ["name"], "type": "Entity"} diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py index 7225bb3ae..10799bb33 100644 --- a/cognee/modules/engine/models/EntityType.py +++ b/cognee/modules/engine/models/EntityType.py @@ -5,5 +5,6 @@ class EntityType(DataPoint): __tablename__ = "entity_type" name: str description: str + pydantic_type: str = "EntityType" _metadata: dict = {"index_fields": ["name"], "type": "EntityType"} diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 164327da0..926aae9fa 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -5,12 +5,14 @@ from cognee.infrastructure.engine import DataPoint class Repository(DataPoint): __tablename__ = "Repository" path: str + pydantic_type: str = "Repository" _metadata: dict = {"index_fields": [], "type": "Repository"} class CodeFile(DataPoint): __tablename__ = "codefile" extracted_id: str # actually file path + pydantic_type: str = "CodeFile" source_code: Optional[str] = None part_of: Optional[Repository] = None depends_on: Optional[List["CodeFile"]] = None @@ -22,6 +24,7 @@ class CodeFile(DataPoint): class CodePart(DataPoint): __tablename__ = "codepart" # part_of: Optional[CodeFile] = None + pydantic_type: str = "CodePart" source_code: Optional[str] = None _metadata: dict = {"index_fields": [], "type": "CodePart"} @@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint): 
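The pydantic_type fields introduced above act as a plain string discriminator that is stored with each node, so retrieval code can tell node kinds apart after graph projection without importing the concrete model classes. A small illustrative sketch of that pattern (not part of the patch series; ConfigFile is a hypothetical model invented here, and the projected node is modelled as a plain dict, whereas the retriever below reads the same field via node.get_attribute("pydantic_type")):

from typing import Optional

from cognee.infrastructure.engine import DataPoint


class ConfigFile(DataPoint):
    # Hypothetical model, shown only to illustrate the discriminator convention.
    __tablename__ = "configfile"
    path: str
    raw_text: Optional[str] = None
    pydantic_type: str = "ConfigFile"  # plain string discriminator serialized with the node
    _metadata: dict = {"index_fields": [], "type": "ConfigFile"}


def describe(node_properties: dict) -> str:
    # After projecting the graph, only the stored properties remain; the
    # discriminator tells us which model a node came from.
    kind = node_properties.get("pydantic_type")
    if kind == "SourceCodeChunk":
        return "chunk of a code part"
    if kind in ("CodePart", "CodeFile"):
        return "code container"
    return f"other node kind: {kind}"


print(describe({"pydantic_type": "CodePart"}))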
__tablename__ = "sourcecodechunk" code_chunk_of: Optional[CodePart] = None source_code: Optional[str] = None + pydantic_type: str = "SourceCodeChunk" previous_chunk: Optional["SourceCodeChunk"] = None _metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"} diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index fc62209ce..bc7b4886d 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -17,5 +17,6 @@ class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str summarizes: Union[CodeFile, CodePart, SourceCodeChunk] + pydantic_type: str = "CodeSummary" _metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"} From 9604d95ba515ecb1056f2f103a1e83e581c546dc Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:54:58 +0100 Subject: [PATCH 19/24] feat: adds basic retriever for swe bench --- .../description_to_codepart_search.py | 31 +++++++---- cognee/shared/data_models.py | 4 ++ evals/eval_swe_bench.py | 53 ++++++++----------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index ecd187907..fec17fb16 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -10,7 +10,7 @@ from cognee.modules.users.models import User from cognee.shared.utils import send_telemetry -async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list: +async def code_description_to_code_part_search(query: str, user: User = None, top_k=5) -> list: if user is None: user = await get_default_user() @@ -55,21 +55,23 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ) try: - results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k) - if not results: + code_summaries = await vector_engine.search( + "code_summary_text", query_text=query, limit=top_k + ) + if not code_summaries: logging.warning("No results found for query: '%s' by user: %s", query, user.id) return [] memory_fragment = CogneeGraph() await memory_fragment.project_graph_from_db( graph_engine, - node_properties_to_project=["id", "type", "text", "source_code"], + node_properties_to_project=["id", "type", "text", "source_code", "pydantic_type"], edge_properties_to_project=["relationship_name"], ) code_pieces_to_return = set() - for node in results: + for node in code_summaries: node_id = str(node.id) node_to_search_from = memory_fragment.get_node(node_id) @@ -78,9 +80,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L continue for code_file in node_to_search_from.get_skeleton_neighbours(): - for code_file_edge in code_file.get_skeleton_edges(): - if code_file_edge.get_attribute("relationship_name") == "contains": - code_pieces_to_return.add(code_file_edge.get_destination_node()) + if code_file.get_attribute("pydantic_type") == "SourceCodeChunk": + for code_file_edge in code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "code_chunk_of": + code_pieces_to_return.add(code_file_edge.get_destination_node()) + elif code_file.get_attribute("pydantic_type") == "CodePart": + code_pieces_to_return.add(code_file) + elif code_file.get_attribute("pydantic_type") == "CodeFile": + for code_file_edge in 
code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "contains": + code_pieces_to_return.add(code_file_edge.get_destination_node()) logging.info( "Search completed for user: %s, query: '%s'. Found %d code pieces.", @@ -89,7 +98,11 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L len(code_pieces_to_return), ) - return list(code_pieces_to_return) + context = "" + for code_piece in code_pieces_to_return: + context = context + code_piece.get_attribute("source_code") + + return context except Exception as exec_error: logging.error( diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py index d23d2841c..a36a09010 100644 --- a/cognee/shared/data_models.py +++ b/cognee/shared/data_models.py @@ -231,6 +231,7 @@ class SummarizedContent(BaseModel): summary: str description: str + pydantic_type: str = "SummarizedContent" class SummarizedFunction(BaseModel): @@ -239,6 +240,7 @@ class SummarizedFunction(BaseModel): inputs: Optional[List[str]] = None outputs: Optional[List[str]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedFunction" class SummarizedClass(BaseModel): @@ -246,6 +248,7 @@ class SummarizedClass(BaseModel): description: str methods: Optional[List[SummarizedFunction]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedClass" class SummarizedCode(BaseModel): @@ -256,6 +259,7 @@ class SummarizedCode(BaseModel): classes: List[SummarizedClass] = [] functions: List[SummarizedFunction] = [] workflow_description: Optional[str] = None + pydantic_type: str = "SummarizedCode" class GraphDBType(Enum): diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 20e005751..b5fcc616b 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -11,7 +11,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt -from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search +from cognee.modules.retrieval.description_to_codepart_search import ( + code_description_to_code_part_search, +) from cognee.shared.utils import render_graph from evals.eval_utils import download_github_repo, retrieved_edges_to_string @@ -32,26 +34,16 @@ def check_install_package(package_name): return False -async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): - repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - - async for result in run_code_graph_pipeline(repo_path, include_docs=True): - print(result) - - print("Here we have the repo under the repo_path") - - await render_graph(None, include_labels=True, include_nodes=True) - +async def generate_patch_with_cognee(instance): + """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - retrieved_edges = await brute_force_triplet_search( - problem_statement, - top_k=3, - collections=["code_summary_text"], - ) + repo_path = "/Users/laszlohajdu/Documents/GitHub/test/" + async for result in run_code_graph_pipeline(repo_path, include_docs=False): + print(result) - retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) + retrieved_codeparts = await code_description_to_code_part_search(problem_statement) prompt = 
"\n".join( [ @@ -60,7 +52,7 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp PATCH_EXAMPLE, "", "These are the retrieved edges:", - retrieved_edges_str, + retrieved_codeparts, ] ) @@ -86,8 +78,6 @@ async def generate_patch_without_cognee(instance, llm_client): async def get_preds(dataset, with_cognee=True): - llm_client = get_llm_client() - if with_cognee: model_name = "with_cognee" pred_func = generate_patch_with_cognee @@ -95,17 +85,18 @@ async def get_preds(dataset, with_cognee=True): model_name = "without_cognee" pred_func = generate_patch_without_cognee - futures = [(instance["instance_id"], pred_func(instance, llm_client)) for instance in dataset] - model_patches = await asyncio.gather(*[x[1] for x in futures]) + preds = [] - preds = [ - { - "instance_id": instance_id, - "model_patch": model_patch, - "model_name_or_path": model_name, - } - for (instance_id, _), model_patch in zip(futures, model_patches) - ] + for instance in dataset: + instance_id = instance["instance_id"] + model_patch = await pred_func(instance) # Sequentially await the async function + preds.append( + { + "instance_id": instance_id, + "model_patch": model_patch, + "model_name_or_path": model_name, + } + ) return preds @@ -135,6 +126,7 @@ async def main(): with open(predictions_path, "w") as file: json.dump(preds, file) + """ This part is for the evaluation subprocess.run( [ "python", @@ -152,6 +144,7 @@ async def main(): "test_run", ] ) + """ if __name__ == "__main__": From 8327c053990f9e71c19a087603865e3cbfa54a8c Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 10 Jan 2025 11:47:21 +0100 Subject: [PATCH 20/24] Match Ruff version in config to the one in github actions --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42f12ea51..96bfe6d32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: - id: check-added-large-files - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.8.3 + rev: v0.9.0 hooks: # Run the linter. - id: ruff From 6177d04b44e8e16ebd7070fb1b4f127e0aea4d6f Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:03:34 +0100 Subject: [PATCH 21/24] feat: implements code retreiver --- .../llm/prompts/patch_gen_kg_instructions.txt | 9 +++-- .../description_to_codepart_search.py | 38 +++++++++++++++++-- evals/eval_swe_bench.py | 13 ++++--- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt index ebbb03f75..3117ac9f1 100644 --- a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt +++ b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt @@ -1,3 +1,6 @@ -I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and -generate a single patch file that I can apply directly to this repository using git apply. -Please respond with a single patch file in the following format. \ No newline at end of file +You are a senior software engineer. I need you to solve this issue by looking at the provided context and +generate a single patch file that I can apply directly to this repository using git apply. 
+Additionally, please make sure that you provide code only with correct syntax and +you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing +functions or classes, as they may be referenced from other code. +Please respond only with a single patch file in the following format without adding any additional context or string. diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index fec17fb16..538f76a6e 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -8,20 +8,27 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.modules.users.methods import get_default_user from cognee.modules.users.models import User from cognee.shared.utils import send_telemetry +from cognee.api.v1.search import SearchType +from cognee.api.v1.search.search_v2 import search +from cognee.infrastructure.llm.get_llm_client import get_llm_client -async def code_description_to_code_part_search(query: str, user: User = None, top_k=5) -> list: +async def code_description_to_code_part_search( + query: str, include_docs=False, user: User = None, top_k=5 +) -> list: if user is None: user = await get_default_user() if user is None: raise PermissionError("No user found in the system. Please create a user.") - retrieved_codeparts = await code_description_to_code_part(query, user, top_k) + retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs) return retrieved_codeparts -async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]: +async def code_description_to_code_part( + query: str, user: User, top_k: int, include_docs: bool +) -> List[str]: """ Maps a code description query to relevant code parts using a CodeGraph pipeline. @@ -29,6 +36,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L query (str): The search query describing the code parts. user (User): The user performing the search. top_k (int): Number of codegraph descriptions to match ( num of corresponding codeparts will be higher) + include_docs(bool): Boolean showing whether we have the docs in the graph or not Returns: Set[str]: A set of unique code parts matching the query. @@ -37,6 +45,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ValueError: If arguments are invalid. RuntimeError: If an unexpected error occurs during execution. 
""" + print(include_docs) if not query or not isinstance(query, str): raise ValueError("The query must be a non-empty string.") if top_k <= 0 or not isinstance(top_k, int): @@ -55,6 +64,26 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ) try: + if include_docs: + search_results = await search(SearchType.INSIGHTS, query_text=query) + + concatenated_descriptions = " ".join( + obj["description"] + for tpl in search_results + for obj in tpl + if isinstance(obj, dict) and "description" in obj + ) + + llm_client = get_llm_client() + context_from_documents = await llm_client.acreate_structured_output( + text_input=f"The retrieved context from documents" + f" is {concatenated_descriptions}.", + system_prompt="You are a Senior Software Engineer, summarize the context from documents" + f" in a way that it is gonna be provided next to codeparts as context" + f" while trying to solve this github issue connected to the project: {query}]", + response_model=str, + ) + code_summaries = await vector_engine.search( "code_summary_text", query_text=query, limit=top_k ) @@ -102,6 +131,9 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L for code_piece in code_pieces_to_return: context = context + code_piece.get_attribute("source_code") + if include_docs: + context = context_from_documents + context + return context except Exception as exec_error: diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index b5fcc616b..894acf1bb 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -14,8 +14,6 @@ from cognee.infrastructure.llm.prompts import read_query_prompt from cognee.modules.retrieval.description_to_codepart_search import ( code_description_to_code_part_search, ) -from cognee.shared.utils import render_graph -from evals.eval_utils import download_github_repo, retrieved_edges_to_string def check_install_package(package_name): @@ -36,14 +34,17 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance): """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" + include_docs = True problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - repo_path = "/Users/laszlohajdu/Documents/GitHub/test/" - async for result in run_code_graph_pipeline(repo_path, include_docs=False): + repo_path = "/Users/laszlohajdu/Documents/GitHub/graph_rag/" + async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs): print(result) - retrieved_codeparts = await code_description_to_code_part_search(problem_statement) + retrieved_codeparts = await code_description_to_code_part_search( + problem_statement, include_docs=include_docs + ) prompt = "\n".join( [ @@ -51,7 +52,7 @@ async def generate_patch_with_cognee(instance): "", PATCH_EXAMPLE, "", - "These are the retrieved edges:", + "This is the additional context to solve the problem (description from documentation together with codeparts):", retrieved_codeparts, ] ) From 06e8d2268b231abb680a89ff86c2525d16599389 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:52:26 +0100 Subject: [PATCH 22/24] Fix: fixes unit test for codepart search --- .../retrieval/description_to_codepart_search.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index 538f76a6e..bd506f76a 100644 
--- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -27,7 +27,7 @@ async def code_description_to_code_part_search( async def code_description_to_code_part( - query: str, user: User, top_k: int, include_docs: bool + query: str, user: User, top_k: int, include_docs: bool = False ) -> List[str]: """ Maps a code description query to relevant code parts using a CodeGraph pipeline. @@ -45,7 +45,6 @@ async def code_description_to_code_part( ValueError: If arguments are invalid. RuntimeError: If an unexpected error occurs during execution. """ - print(include_docs) if not query or not isinstance(query, str): raise ValueError("The query must be a non-empty string.") if top_k <= 0 or not isinstance(top_k, int): @@ -94,7 +93,13 @@ async def code_description_to_code_part( memory_fragment = CogneeGraph() await memory_fragment.project_graph_from_db( graph_engine, - node_properties_to_project=["id", "type", "text", "source_code", "pydantic_type"], + node_properties_to_project=[ + "id", + "type", + "text", + "source_code", + "pydantic_type", + ], edge_properties_to_project=["relationship_name"], ) From 872bc8964843894c3f122f767516ea8e8214e275 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 10 Jan 2025 15:11:00 +0100 Subject: [PATCH 23/24] Format with Ruff 0.9.0 --- .../databases/graph/neo4j_driver/adapter.py | 2 +- .../retrieval/brute_force_triplet_search.py | 2 +- .../description_to_codepart_search.py | 3 +- .../documents/AudioDocument_test.py | 18 +++++----- .../documents/ImageDocument_test.py | 18 +++++----- .../integration/documents/PdfDocument_test.py | 18 +++++----- .../documents/TextDocument_test.py | 18 +++++----- .../documents/UnstructuredDocument_test.py | 30 ++++++++-------- cognee/tests/test_deduplication.py | 12 +++---- cognee/tests/test_falkordb.py | 6 ++-- cognee/tests/test_library.py | 6 ++-- cognee/tests/test_pgvector.py | 36 +++++++++---------- .../chunks/chunk_by_paragraph_2_test.py | 18 +++++----- .../chunks/chunk_by_paragraph_test.py | 6 ++-- .../chunks/chunk_by_sentence_test.py | 12 +++---- .../processing/chunks/chunk_by_word_test.py | 6 ++-- 16 files changed, 105 insertions(+), 106 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 5490f6b43..3543418fc 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -493,7 +493,7 @@ class Neo4jAdapter(GraphDBInterface): query_edges = f""" MATCH (n)-[r]->(m) - WHERE {where_clause} AND {where_clause.replace('n.', 'm.')} + WHERE {where_clause} AND {where_clause.replace("n.", "m.")} RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties """ result_edges = await self.query(query_edges) diff --git a/cognee/modules/retrieval/brute_force_triplet_search.py b/cognee/modules/retrieval/brute_force_triplet_search.py index fdd312480..9c778505d 100644 --- a/cognee/modules/retrieval/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/brute_force_triplet_search.py @@ -43,7 +43,7 @@ def format_triplets(edges): edge_info = {key: value for key, value in edge_attributes.items() if value is not None} # Create the formatted triplet - triplet = f"Node1: {node1_info}\n" f"Edge: {edge_info}\n" f"Node2: {node2_info}\n\n\n" + triplet = f"Node1: {node1_info}\nEdge: {edge_info}\nNode2: {node2_info}\n\n\n" triplets.append(triplet) return 
"".join(triplets) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index bd506f76a..243fdbde3 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -75,8 +75,7 @@ async def code_description_to_code_part( llm_client = get_llm_client() context_from_documents = await llm_client.acreate_structured_output( - text_input=f"The retrieved context from documents" - f" is {concatenated_descriptions}.", + text_input=f"The retrieved context from documents is {concatenated_descriptions}.", system_prompt="You are a Senior Software Engineer, summarize the context from documents" f" in a way that it is gonna be provided next to codeparts as context" f" while trying to solve this github issue connected to the project: {query}]", diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index dbd43ddda..e07a2431b 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -36,12 +36,12 @@ def test_AudioDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index c0877ae99..b8d585419 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -25,12 +25,12 @@ def test_ImageDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/PdfDocument_test.py 
b/cognee/tests/integration/documents/PdfDocument_test.py index 8f28815d3..fc4307846 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -27,12 +27,12 @@ def test_PdfDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 1e143d563..6daec62b7 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -39,12 +39,12 @@ def test_TextDocument(input_file, chunk_size): for ground_truth, paragraph_data in zip( GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index e0278de81..773dc2293 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -71,32 +71,32 @@ def test_UnstructuredDocument(): for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test DOCX for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == 
len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" - assert ( - "sentence_end" == paragraph_data.cut_type - ), f" sentence_end != {paragraph_data.cut_type = }" + assert "sentence_end" == paragraph_data.cut_type, ( + f" sentence_end != {paragraph_data.cut_type = }" + ) # TEST CSV for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" - assert ( - "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text - ), f"Read text doesn't match expected text: {paragraph_data.text}" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, ( + f"Read text doesn't match expected text: {paragraph_data.text}" + ) + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test XLSX for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) diff --git a/cognee/tests/test_deduplication.py b/cognee/tests/test_deduplication.py index 9c2df032d..89c866f12 100644 --- a/cognee/tests/test_deduplication.py +++ b/cognee/tests/test_deduplication.py @@ -30,9 +30,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - result[0]["name"] == "Natural_language_processing_copy" - ), "Result name does not match expected value." + assert result[0]["name"] == "Natural_language_processing_copy", ( + "Result name does not match expected value." + ) result = await relational_engine.get_all_data_from_table("datasets") assert len(result) == 2, "Unexpected number of datasets found." @@ -61,9 +61,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"] - ), "Content hash is not a part of file name." + assert hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"], ( + "Content hash is not a part of file name." 
+ ) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_falkordb.py b/cognee/tests/test_falkordb.py index 07ece9eb2..af0e87916 100755 --- a/cognee/tests/test_falkordb.py +++ b/cognee/tests/test_falkordb.py @@ -85,9 +85,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 8352b4161..192b67506 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -82,9 +82,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index c241177f0..73b6be974 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -24,28 +24,28 @@ async def test_local_file_deletion(data_text, file_location): data_hash = hashlib.md5(encoded_text).hexdigest() # Get data entry from database based on hash contents data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test deletion of data along with local files created by cognee await engine.delete_data_entity(data.id) - assert not os.path.exists( - data.raw_data_location - ), f"Data location still exists after deletion: {data.raw_data_location}" + assert not os.path.exists(data.raw_data_location), ( + f"Data location still exists after deletion: {data.raw_data_location}" + ) async with engine.get_async_session() as session: # Get data entry from database based on file path data = ( await session.scalars(select(Data).where(Data.raw_data_location == file_location)) ).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test local files not created by cognee won't get deleted await engine.delete_data_entity(data.id) - assert os.path.exists( - data.raw_data_location - ), f"Data location doesn't exists: {data.raw_data_location}" + assert os.path.exists(data.raw_data_location), ( + f"Data location doesn't exists: {data.raw_data_location}" + ) async def test_getting_of_documents(dataset_name_1): @@ -54,16 +54,16 @@ async def test_getting_of_documents(dataset_name_1): user = await get_default_user() document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - assert ( - len(document_ids) == 1 - ), f"Number of expected documents doesn't match {len(document_ids)} != 1" + assert len(document_ids) == 1, ( + f"Number of expected documents doesn't match {len(document_ids)} != 1" + ) # Test getting of documents for search when no 
dataset is provided user = await get_default_user() document_ids = await get_document_ids_for_user(user.id) - assert ( - len(document_ids) == 2 - ), f"Number of expected documents doesn't match {len(document_ids)} != 2" + assert len(document_ids) == 2, ( + f"Number of expected documents doesn't match {len(document_ids)} != 2" + ) async def main(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py index 53098fc67..d8680a604 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py @@ -17,9 +17,9 @@ batch_paragraphs_vals = [True, False] def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) reconstructed_text = "".join([chunk["text"] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -36,9 +36,9 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > paragraph_length] - assert np.all( - chunk_lengths <= paragraph_length - ), f"{paragraph_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= paragraph_length), ( + f"{paragraph_length = }: {larger_chunks} are too large" + ) @pytest.mark.parametrize( @@ -50,6 +50,6 @@ def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_ data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs ) chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) - assert np.all( - chunk_indices == np.arange(len(chunk_indices)) - ), f"{chunk_indices = } are not monotonically increasing" + assert np.all(chunk_indices == np.arange(len(chunk_indices))), ( + f"{chunk_indices = } are not monotonically increasing" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index e7d9a54ba..e420b2e9f 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -58,9 +58,9 @@ def run_chunking_test(test_text, expected_chunks): for expected_chunks_item, chunk in zip(expected_chunks, chunks): for key in ["text", "word_count", "cut_type"]: - assert ( - chunk[key] == expected_chunks_item[key] - ), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + assert chunk[key] == expected_chunks_item[key], ( + f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + ) def test_chunking_whole_text(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index d1c75d7ed..efa053077 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -16,9 +16,9 @@ maximum_length_vals = [None, 8, 64] def test_chunk_by_sentence_isomorphism(input_text, maximum_length): chunks = chunk_by_sentence(input_text, maximum_length) reconstructed_text = "".join([chunk[1] 
for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -36,6 +36,6 @@ def test_paragraph_chunk_length(input_text, maximum_length): chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > maximum_length] - assert np.all( - chunk_lengths <= maximum_length - ), f"{maximum_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= maximum_length), ( + f"{maximum_length = }: {larger_chunks} are too large" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py index fb26638cb..d79fcdbc8 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py @@ -17,9 +17,9 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS def test_chunk_by_word_isomorphism(input_text): chunks = chunk_by_word(input_text) reconstructed_text = "".join([chunk[0] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( From e2ad54d88e93bed169182feb212fbcb9a3fb86f1 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:54:45 +0100 Subject: [PATCH 24/24] Fix: deleting incorrect repo path --- evals/eval_swe_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 894acf1bb..a8b4c8a1d 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -14,6 +14,7 @@ from cognee.infrastructure.llm.prompts import read_query_prompt from cognee.modules.retrieval.description_to_codepart_search import ( code_description_to_code_part_search, ) +from evals.eval_utils import download_github_repo, retrieved_edges_to_string def check_install_package(package_name): @@ -33,12 +34,11 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance): - """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" + repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") include_docs = True problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - repo_path = "/Users/laszlohajdu/Documents/GitHub/graph_rag/" async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs): print(result)
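With the hard-coded local path removed, the evaluation entry point can be driven end to end again. A minimal driver sketch (not part of the patch series), assuming the evals directory is importable from the repository root; the instance stub below is illustrative only, since download_github_repo will additionally need the repo and commit fields carried by a real SWE-bench record:

import asyncio
import json

from evals.eval_swe_bench import get_preds


async def main():
    # One SWE-bench style record; only instance_id and problem_statement are shown here.
    dataset = [
        {
            "instance_id": "example__project-0001",
            "problem_statement": "Calling foo() with an empty list raises IndexError.",
        }
    ]

    # get_preds runs generate_patch_with_cognee sequentially for each instance and
    # returns dicts with instance_id, model_patch and model_name_or_path.
    preds = await get_preds(dataset, with_cognee=True)
    with open("preds.json", "w") as file:
        json.dump(preds, file)


if __name__ == "__main__":
    asyncio.run(main())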