Commit a6dfff883c by Rita Aleksziev, 2025-01-06 10:21:04 +01:00
32 changed files with 5419 additions and 251 deletions

.github/workflows/ruff_format.yaml (new file)

@ -0,0 +1,11 @@
name: ruff format
on: [ pull_request ]
jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/ruff-action@v2
        with:
          args: "format --check"

.github/workflows/ruff_lint.yaml (new file)

@ -0,0 +1,9 @@
name: ruff lint
on: [ pull_request ]
jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/ruff-action@v2

.pre-commit-config.yaml (new file)

@ -0,0 +1,20 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.8.3
    hooks:
      # Run the linter.
      - id: ruff
        types_or: [ python, pyi ]
      # Run the formatter.
      - id: ruff-format
        types_or: [ python, pyi ]

README.md

@ -17,6 +17,9 @@ Try it in a Google Colab <a href="https://colab.research.google.com/drive/1g-Qn
If you have questions, join our <a href="https://discord.gg/NQPKmU5CCg">Discord</a> community
<div align="center">
<img src="assets/cognee_benefits.png" alt="why cognee" width="80%" />
</div>
## 📦 Installation
@ -193,93 +196,14 @@ if __name__ == '__main__':
When you run this script, you will see step-by-step messages in the console that help you trace the execution flow and understand what the script is doing at each stage.
A version of this example is here: `examples/python/simple_example.py`
### Create your own memory store
### Understand our architecture
The cognee framework consists of tasks that can be grouped into pipelines.
Each task can be an independent piece of business logic that can be tied to other tasks to form a pipeline.
These tasks persist data into your memory store, enabling you to search for relevant context from past conversations, documents, or any other data you have stored.
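In its simplest form, the flow looks like this (a minimal sketch; `cognee.add`, `cognee.cognify`, and `cognee.search` with `SearchType.INSIGHTS` all appear elsewhere in this commit):
```
import cognee
from cognee.api.v1.search import SearchType

await cognee.add("Natural language processing (NLP) is an interdisciplinary subfield...")  # persist raw data
await cognee.cognify()  # run the default pipeline to build the memory store
results = await cognee.search(SearchType.INSIGHTS, query_text="What is NLP?")
```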
### Example: Classify your documents
Here is an example of how it looks for a default cognify pipeline:
1. To prepare the data for the pipeline run, first we need to add it to our metastore and normalize it:
Start with:
```
text = """Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval"""
await cognee.add(text) # Add a new piece of information
```
2. In the next step we make a task. The task can be any business logic we need, but the important part is that it should be encapsulated in one function.
Here we show an example of creating a naive LLM classifier that takes a Pydantic model and then stores the data in both the graph and vector stores after analyzing each chunk.
We provided just a snippet for reference, but feel free to check out the implementation in our repo.
```
async def chunk_naive_llm_classifier(
    data_chunks: list[DocumentChunk],
    classification_model: Type[BaseModel]
):
    # Extract classifications asynchronously
    chunk_classifications = await asyncio.gather(
        *(extract_categories(chunk.text, classification_model) for chunk in data_chunks)
    )

    # Collect classification data points using a set to avoid duplicates
    classification_data_points = {
        uuid5(NAMESPACE_OID, cls.label.type)
        for cls in chunk_classifications
    } | {
        uuid5(NAMESPACE_OID, subclass.value)
        for cls in chunk_classifications
        for subclass in cls.label.subclass
    }

    vector_engine = get_vector_engine()
    collection_name = "classification"

    # Define the payload schema
    class Keyword(BaseModel):
        uuid: str
        text: str
        chunk_id: str
        document_id: str

    # Ensure the collection exists and retrieve existing data points
    if not await vector_engine.has_collection(collection_name):
        await vector_engine.create_collection(collection_name, payload_schema=Keyword)
        existing_points_map = {}
    else:
        existing_points_map = {}

    return data_chunks
...
```
We have many tasks that can be used in your pipelines, and you can also create your own tasks to fit your business logic (see the sketch after this walkthrough).
3. Once we have our tasks, it is time to group them into a pipeline.
This simplified snippet demonstrates how tasks can be added to a pipeline and how they pass information forward from one to another.
```
tasks = [
    Task(
        chunk_naive_llm_classifier,
        classification_model = cognee_config.classification_model,
    ),
]

pipeline = run_tasks(tasks, documents)
```
To see the working code, check cognee.api.v1.cognify default pipeline in our repo.
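Here is a sketch of what a custom task can look like. The function name and the printed attribute below are hypothetical, not part of cognee's API:
```
from cognee.modules.pipelines.tasks.Task import Task

async def log_word_counts(data_chunks: list):
    # A task body is just a function: it receives the previous task's output
    # and returns data for the next task in the pipeline.
    for chunk in data_chunks:
        print(len(chunk.text.split()))  # chunk.text as in the classifier above
    return data_chunks

word_count_task = Task(log_word_counts)
```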
<div align="center">
<img src="assets/cognee_diagram.png" alt="cognee concept diagram" width="50%" />
</div>
## Vector retrieval, Graphs and LLMs
@ -338,11 +262,7 @@ pip install cognee
## Vector & Graph Databases Implementation State
<style>
table {
width: 100%;
}
</style>
| Name | Type | Current state | Known Issues |
|----------|--------------------|-------------------|--------------|
@ -353,4 +273,4 @@ pip install cognee
| NetworkX | Graph | Stable &#x2705; | |
| FalkorDB | Vector/Graph | Unstable &#x274C; | |
| PGVector | Vector | Stable &#x2705; | |
| Milvus | Vector | Stable &#x2705; | |

Binary assets (files not shown):
- (file name missing) before: 77 KiB
- assets/cognee_benefits.png (new file), after: 353 KiB
- assets/cognee_diagram.png (new file), after: 428 KiB

cognee-mcp/README.md

@ -1,57 +1,85 @@
# cognee MCP server

### Installing Manually

1. Clone the [cognee](www.github.com/topoteretes/cognee) repo

2. Install dependencies

```
pip install uv
```

```
brew install postgresql
```

```
brew install rust
```

Then run

```
cd cognee-mcp
uv sync --dev --all-extras
```

3. Activate the venv with

```
source .venv/bin/activate
```

4. Add the new server to your Claude config:

The file should be located here: ~/Library/Application\ Support/Claude/
You need to create claude_desktop_config.json in this folder if it doesn't exist.

```
{
  "mcpServers": {
    "cognee": {
      "command": "/Users/{user}/cognee/.venv/bin/uv",
      "args": [
        "--directory",
        "/Users/{user}/cognee/cognee-mcp",
        "run",
        "cognee"
      ],
      "env": {
        "ENV": "local",
        "TOKENIZERS_PARALLELISM": "false",
        "LLM_API_KEY": "sk-"
      }
    },
    "filesystem": {
      "command": "npx",
      "args": [
        "-y",
        "@modelcontextprotocol/server-filesystem",
        "/Users/{user}/Desktop",
        "/Users/{user}/Projects"
      ]
    }
  }
}
```

Restart your Claude desktop.

### Installing via Smithery

To install Cognee for Claude Desktop automatically via [Smithery](https://smithery.ai/server/cognee):

```bash
npx -y @smithery/cli install cognee --client claude
```

Restart your Claude desktop.


@ -8,7 +8,7 @@ def main():
asyncio.run(server.main())
# Optionally expose other important items at package level
__all__ = ['main', 'server']
__all__ = ["main", "server"]
if __name__ == "__main__":
main()


@ -1,5 +1,6 @@
import importlib.util
import os
import asyncio
from contextlib import redirect_stderr, redirect_stdout
import cognee
@ -9,15 +10,17 @@ from cognee.api.v1.search import SearchType
from cognee.shared.data_models import KnowledgeGraph
from mcp.server import NotificationOptions, Server
from mcp.server.models import InitializationOptions
from pydantic import AnyUrl, BaseModel
server = Server("mcpcognee")
server = Server("cognee-mcp")
def node_to_string(node):
keys_to_keep = ["chunk_index", "topological_rank", "cut_type", "id", "text"]
keyset = set(keys_to_keep) & node.keys()
return "Node(" + " ".join([key + ": " + str(node[key]) + "," for key in keyset]) + ")"
# keys_to_keep = ["chunk_index", "topological_rank", "cut_type", "id", "text"]
# keyset = set(keys_to_keep) & node.keys()
# return "Node(" + " ".join([key + ": " + str(node[key]) + "," for key in keyset]) + ")"
node_data = ", ".join([f"{key}: \"{value}\"" for key, value in node.items() if key in ["id", "name"]])
return f"Node({node_data})"
def retrieved_edges_to_string(search_results):
@ -49,60 +52,107 @@ async def handle_list_tools() -> list[types.Tool]:
"""
return [
types.Tool(
name="Cognify_and_search",
description="Build knowledge graph from the input text and search in it.",
inputSchema={
name = "cognify",
description = "Build knowledge graph from the input text.",
inputSchema = {
"type": "object",
"properties": {
"text": {"type": "string"},
"search_query": {"type": "string"},
"graph_model_file": {"type": "string"},
"graph_model_name": {"type": "string"},
},
"required": ["text", "search_query"],
"required": ["text"],
},
)
),
types.Tool(
name = "search",
description = "Search the knowledge graph.",
inputSchema = {
"type": "object",
"properties": {
"query": {"type": "string"},
},
"required": ["query"],
},
),
types.Tool(
name = "prune",
description = "Reset the knowledge graph.",
inputSchema = {
"type": "object",
"properties": {
"query": {"type": "string"},
},
},
),
]
@server.call_tool()
async def handle_call_tool(
name: str, arguments: dict | None
name: str,
arguments: dict | None
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
"""
Handle tool execution requests.
Tools can modify server state and notify clients of changes.
"""
if name == "Cognify_and_search":
if name == "cognify":
with open(os.devnull, "w") as fnull:
with redirect_stdout(fnull), redirect_stderr(fnull):
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
if not arguments:
raise ValueError("Missing arguments")
text = arguments.get("text")
search_query = arguments.get("search_query")
if ("graph_model_file" in arguments) and ("graph_model_name" in arguments):
model_file = arguments.get("graph_model_file")
model_name = arguments.get("graph_model_name")
graph_model = load_class(model_file, model_name)
else:
graph_model = KnowledgeGraph
await cognee.add(text)
await cognee.cognify(graph_model=graph_model)
await cognee.cognify(graph_model = graph_model)
return [
types.TextContent(
type = "text",
text = "Ingested",
)
]
elif name == "search":
with open(os.devnull, "w") as fnull:
with redirect_stdout(fnull), redirect_stderr(fnull):
if not arguments:
raise ValueError("Missing arguments")
search_query = arguments.get("query")
search_results = await cognee.search(
SearchType.INSIGHTS, query_text=search_query
SearchType.INSIGHTS, query_text = search_query
)
results = retrieved_edges_to_string(search_results)
return [
types.TextContent(
type="text",
text=results,
type = "text",
text = results,
)
]
elif name == "prune":
with open(os.devnull, "w") as fnull:
with redirect_stdout(fnull), redirect_stderr(fnull):
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
return [
types.TextContent(
type = "text",
text = "Pruned",
)
]
else:
@ -116,11 +166,15 @@ async def main():
read_stream,
write_stream,
InitializationOptions(
server_name="mcpcognee",
server_version="0.1.0",
capabilities=server.get_capabilities(
notification_options=NotificationOptions(),
experimental_capabilities={},
server_name = "cognee-mcp",
server_version = "0.1.0",
capabilities = server.get_capabilities(
notification_options = NotificationOptions(),
experimental_capabilities = {},
),
),
)
)
# This is needed if you'd like to connect to a custom client
if __name__ == "__main__":
asyncio.run(main())


@ -1,4 +0,0 @@
from mcpcognee import main
import asyncio
asyncio.run(main())

cognee-mcp/pyproject.toml

@ -1,5 +1,5 @@
[project]
name = "mcpcognee"
name = "cognee-mcp"
version = "0.1.0"
description = "A MCP server project"
readme = "README.md"
@ -91,4 +91,4 @@ dev = [
]
[project.scripts]
mcpcognee = "mcpcognee:main"
cognee = "cognee_mcp:main"

cognee-mcp/uv.lock (generated, new file)

File diff suppressed because it is too large.


@ -3,6 +3,8 @@ import logging
from pathlib import Path
from cognee.base_config import get_base_config
from cognee.infrastructure.databases.vector.embeddings import \
get_embedding_engine
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.pipelines import run_tasks
from cognee.modules.pipelines.tasks.Task import Task
@ -15,8 +17,10 @@ from cognee.tasks.ingestion import ingest_data_with_metadata
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_data_list_for_user,
get_non_code_files,
get_non_py_files,
get_repo_file_dependencies)
from cognee.tasks.repo_processor.get_source_code_chunks import \
get_source_code_chunks
from cognee.tasks.storage import add_data_points
monitoring = get_base_config().monitoring_tool
@ -28,6 +32,7 @@ from cognee.tasks.summarization import summarize_code, summarize_text
logger = logging.getLogger("code_graph_pipeline")
update_status_lock = asyncio.Lock()
@observe
async def run_code_graph_pipeline(repo_path, include_docs=True):
import os
@ -46,20 +51,23 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
await cognee.prune.prune_system(metadata=True)
await create_db_and_tables()
embedding_engine = get_embedding_engine()
cognee_config = get_cognify_config()
user = await get_default_user()
tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
Task(enrich_dependency_graph),
Task(expand_dependency_graph, task_config={"batch_size": 50}),
Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}),
Task(summarize_code, task_config={"batch_size": 50}),
Task(add_data_points, task_config={"batch_size": 50}),
]
if include_docs:
non_code_tasks = [
Task(get_non_code_files, task_config={"batch_size": 50}),
Task(get_non_py_files, task_config={"batch_size": 50}),
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
@ -71,7 +79,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
task_config={"batch_size": 50}
),
]
if include_docs:
async for result in run_tasks(non_code_tasks, repo_path):
yield result

cognee/infrastructure/databases/exceptions/EmbeddingException.py (new file)

@ -0,0 +1,3 @@
class EmbeddingException(Exception):
    """Custom exception for handling embedding-related errors."""
    pass


@ -1,15 +1,23 @@
import os
from os import path
import logging
from uuid import UUID
from typing import Optional
from typing import AsyncGenerator, List
from contextlib import asynccontextmanager
from sqlalchemy import text, select, MetaData, Table
from sqlalchemy import text, select, MetaData, Table, delete
from sqlalchemy.orm import joinedload
from sqlalchemy.exc import NoResultFound
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from cognee.infrastructure.databases.exceptions import EntityNotFoundError
from cognee.modules.data.models.Data import Data
from ..ModelBase import Base
logger = logging.getLogger(__name__)
class SQLAlchemyAdapter():
def __init__(self, connection_string: str):
self.db_path: str = None
@ -86,9 +94,9 @@ class SQLAlchemyAdapter():
return [schema[0] for schema in result.fetchall()]
return []
async def delete_data_by_id(self, table_name: str, data_id: UUID, schema_name: Optional[str] = "public"):
async def delete_entity_by_id(self, table_name: str, data_id: UUID, schema_name: Optional[str] = "public"):
"""
Delete data in given table based on id. Table must have an id Column.
Delete entity in given table based on id. Table must have an id Column.
"""
if self.engine.dialect.name == "sqlite":
async with self.get_async_session() as session:
@ -107,6 +115,42 @@ class SQLAlchemyAdapter():
await session.commit()
async def delete_data_entity(self, data_id: UUID):
"""
Delete data and local files related to data if there are no references to it anymore.
"""
async with self.get_async_session() as session:
if self.engine.dialect.name == "sqlite":
# Foreign key constraints are disabled by default in SQLite (for backwards compatibility),
# so must be enabled for each database connection/session separately.
await session.execute(text("PRAGMA foreign_keys = ON;"))
try:
data_entity = (await session.scalars(select(Data).where(Data.id == data_id))).one()
except (ValueError, NoResultFound) as e:
raise EntityNotFoundError(message=f"Entity not found: {str(e)}")
# Check if other data objects point to the same raw data location
raw_data_location_entities = (await session.execute(
select(Data.raw_data_location).where(Data.raw_data_location == data_entity.raw_data_location))).all()
# Don't delete local file unless this is the only reference to the file in the database
if len(raw_data_location_entities) == 1:
# delete local file only if it's created by cognee
from cognee.base_config import get_base_config
config = get_base_config()
if config.data_root_directory in raw_data_location_entities[0].raw_data_location:
if os.path.exists(raw_data_location_entities[0].raw_data_location):
os.remove(raw_data_location_entities[0].raw_data_location)
else:
# Report bug as file should exist
logger.error("Local file which should exist can't be found.")
await session.execute(delete(Data).where(Data.id == data_id))
await session.commit()
async def get_table(self, table_name: str, schema_name: Optional[str] = "public") -> Table:
"""
Dynamically loads a table using the given table name and schema name.


@ -5,17 +5,19 @@ from typing import List, Optional
import litellm
import os
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
litellm.set_verbose = False
logger = logging.getLogger("LiteLLMEmbeddingEngine")
class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str
endpoint: str
api_version: str
model: str
dimensions: int
mock:bool
mock: bool
def __init__(
self,
@ -33,7 +35,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
if isinstance(enable_mocking, bool):
enable_mocking= str(enable_mocking).lower()
enable_mocking = str(enable_mocking).lower()
self.mock = enable_mocking in ("true", "1", "yes")
MAX_RETRIES = 5
@ -43,7 +45,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
async def exponential_backoff(attempt):
wait_time = min(10 * (2 ** attempt), 60) # Max 60 seconds
await asyncio.sleep(wait_time)
try:
if self.mock:
response = {
@ -56,10 +58,10 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
else:
response = await litellm.aembedding(
self.model,
input = text,
api_key = self.api_key,
api_base = self.endpoint,
api_version = self.api_version
input=text,
api_key=self.api_key,
api_base=self.endpoint,
api_version=self.api_version
)
self.retry_count = 0
@ -71,7 +73,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
if len(text) == 1:
parts = [text]
else:
parts = [text[0:math.ceil(len(text)/2)], text[math.ceil(len(text)/2):]]
parts = [text[0:math.ceil(len(text) / 2)], text[math.ceil(len(text) / 2):]]
parts_futures = [self.embed_text(part) for part in parts]
embeddings = await asyncio.gather(*parts_futures)
@ -95,6 +97,9 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
return await self.embed_text(text)
except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError):
raise EmbeddingException("Failed to index data points.")
except Exception as error:
logger.error("Error embedding text: %s", str(error))
raise error


@ -12,7 +12,8 @@ from cognee.infrastructure.llm.llm_interface import LLMInterface
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.base_config import get_base_config
if MonitoringTool.LANGFUSE:
monitoring = get_base_config().monitoring_tool
if monitoring == MonitoringTool.LANGFUSE:
from langfuse.decorators import observe
class OpenAIAdapter(LLMInterface):
@ -43,7 +44,7 @@ class OpenAIAdapter(LLMInterface):
base_config = get_base_config()
@observe()
@observe(as_type='generation')
async def acreate_structured_output(self, text_input: str, system_prompt: str,
response_model: Type[BaseModel]) -> BaseModel:


@ -1,12 +1,17 @@
from typing import Type
import logging
import os
from typing import Type
from instructor.exceptions import InstructorRetryException
from pydantic import BaseModel
from tenacity import RetryError
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.shared.data_models import SummarizedCode, SummarizedClass, SummarizedFunction
from cognee.shared.data_models import SummarizedCode
from cognee.tasks.summarization.mock_summary import get_mock_summarized_code
logger = logging.getLogger("extract_summary")
async def extract_summary(content: str, response_model: Type[BaseModel]):
llm_client = get_llm_client()
@ -14,7 +19,7 @@ async def extract_summary(content: str, response_model: Type[BaseModel]):
system_prompt = read_query_prompt("summarize_content.txt")
llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)
return llm_output
async def extract_code_summary(content: str):
@ -27,5 +32,10 @@ async def extract_code_summary(content: str):
result = get_mock_summarized_code()
return result
else:
result = await extract_summary(content, response_model=SummarizedCode)
try:
result = await extract_summary(content, response_model=SummarizedCode)
except (RetryError, InstructorRetryException) as e:
logger.error("Failed to extract code summary, falling back to mock summary", exc_info=e)
result = get_mock_summarized_code()
return result


@ -17,4 +17,4 @@ async def delete_data(data: Data):
db_engine = get_relational_engine()
return await db_engine.delete_data_by_id(data.__tablename__, data.id)
return await db_engine.delete_data_entity(data.id)


@ -4,4 +4,4 @@ from cognee.infrastructure.databases.relational import get_relational_engine
async def delete_dataset(dataset: Dataset):
db_engine = get_relational_engine()
return await db_engine.delete_data_by_id(dataset.__tablename__, dataset.id)
return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)

cognee/shared/CodeGraphEntities.py

@ -1,5 +1,4 @@
from typing import List, Optional
from cognee.infrastructure.engine import DataPoint
@ -7,7 +6,7 @@ class Repository(DataPoint):
__tablename__ = "Repository"
path: str
_metadata: dict = {
"index_fields": ["source_code"],
"index_fields": [],
"type": "Repository"
}
@ -19,29 +18,31 @@ class CodeFile(DataPoint):
depends_on: Optional[List["CodeFile"]] = None
depends_directly_on: Optional[List["CodeFile"]] = None
contains: Optional[List["CodePart"]] = None
_metadata: dict = {
"index_fields": ["source_code"],
"index_fields": [],
"type": "CodeFile"
}
class CodePart(DataPoint):
__tablename__ = "codepart"
# part_of: Optional[CodeFile]
source_code: str
# part_of: Optional[CodeFile] = None
source_code: Optional[str] = None
_metadata: dict = {
"index_fields": ["source_code"],
"index_fields": [],
"type": "CodePart"
}
class CodeRelationship(DataPoint):
source_id: str
target_id: str
relation: str # depends on or depends directly
class SourceCodeChunk(DataPoint):
__tablename__ = "sourcecodechunk"
code_chunk_of: Optional[CodePart] = None
source_code: Optional[str] = None
previous_chunk: Optional["SourceCodeChunk"] = None
_metadata: dict = {
"type": "CodeRelationship"
"index_fields": ["source_code"],
"type": "SourceCodeChunk"
}
CodeFile.model_rebuild()
CodePart.model_rebuild()
SourceCodeChunk.model_rebuild()

cognee/shared/data_models.py

@ -210,7 +210,6 @@ class SummarizedClass(BaseModel):
decorators: Optional[List[str]] = None
class SummarizedCode(BaseModel):
file_name: str
high_level_summary: str
key_features: List[str]
imports: List[str] = []


@ -71,7 +71,7 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non
path = repo_path,
)
yield repo
yield [repo]
with ProcessPoolExecutor(max_workers = 12) as executor:
loop = asyncio.get_event_loop()
@ -90,10 +90,11 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non
results = await asyncio.gather(*tasks)
code_files = []
for (file_path, metadata), dependencies in zip(py_files_dict.items(), results):
source_code = metadata.get("source_code")
yield CodeFile(
code_files.append(CodeFile(
id = uuid5(NAMESPACE_OID, file_path),
source_code = source_code,
extracted_id = file_path,
@ -106,4 +107,6 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non
source_code = py_files_dict.get(dependency, {}).get("source_code"),
) for dependency in dependencies
] if dependencies else None,
)
))
yield code_files

cognee/tasks/repo_processor/get_source_code_chunks.py (new file)

@ -0,0 +1,164 @@
import logging
from typing import AsyncGenerator, Generator
from uuid import NAMESPACE_OID, uuid5

import parso
import tiktoken

from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk

logger = logging.getLogger("task:get_source_code_chunks")


def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int:
    return len(tokenizer.encode(source_code))


def _get_naive_subchunk_token_counts(
    tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000
) -> list[tuple[str, int]]:
    """Splits source code into subchunks of up to max_subchunk_tokens and counts tokens."""
    token_ids = tokenizer.encode(source_code)
    subchunk_token_counts = []

    for start_idx in range(0, len(token_ids), max_subchunk_tokens):
        subchunk_token_ids = token_ids[start_idx: start_idx + max_subchunk_tokens]
        token_count = len(subchunk_token_ids)
        subchunk = ''.join(
            tokenizer.decode_single_token_bytes(token_id).decode('utf-8', errors='replace')
            for token_id in subchunk_token_ids
        )
        subchunk_token_counts.append((subchunk, token_count))

    return subchunk_token_counts


def _get_subchunk_token_counts(
    tokenizer: tiktoken.Encoding,
    source_code: str,
    max_subchunk_tokens: int = 8000,
    depth: int = 0,
    max_depth: int = 100
) -> list[tuple[str, int]]:
    """Splits source code into subchunks and counts tokens for each subchunk."""
    if depth > max_depth:
        return _get_naive_subchunk_token_counts(tokenizer, source_code, max_subchunk_tokens)

    try:
        module = parso.parse(source_code)
    except Exception as e:
        logger.error(f"Error parsing source code: {e}")
        return []

    if not module.children:
        logger.warning("Parsed module has no children (empty or invalid source code).")
        return []

    # Handle cases with only one real child and an EndMarker to prevent infinite recursion.
    if len(module.children) <= 2:
        module = module.children[0]

    subchunk_token_counts = []
    for child in module.children:
        subchunk = child.get_code()
        token_count = _count_tokens(tokenizer, subchunk)

        if token_count == 0:
            continue

        if token_count <= max_subchunk_tokens:
            subchunk_token_counts.append((subchunk, token_count))
            continue

        if child.type == 'string':
            subchunk_token_counts.extend(_get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens))
            continue

        subchunk_token_counts.extend(
            _get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth)
        )

    return subchunk_token_counts


def _get_chunk_source_code(
    code_token_counts: list[tuple[str, int]], overlap: float, max_tokens: int
) -> tuple[list[tuple[str, int]], str]:
    """Generates a chunk of source code from tokenized subchunks with overlap handling."""
    current_count = 0
    cumulative_counts = []
    current_source_code = ''

    for i, (child_code, token_count) in enumerate(code_token_counts):
        current_count += token_count
        cumulative_counts.append(current_count)
        if current_count > max_tokens:
            break
        current_source_code += f"\n{child_code}"

    if current_count <= max_tokens:
        return [], current_source_code.strip()

    cutoff = 1
    for i, cum_count in enumerate(cumulative_counts):
        if cum_count > (1 - overlap) * max_tokens:
            break
        cutoff = i

    return code_token_counts[cutoff:], current_source_code.strip()


def get_source_code_chunks_from_code_part(
    code_file_part: CodePart,
    max_tokens: int = 8192,
    overlap: float = 0.25,
    granularity: float = 0.1,
    model_name: str = "text-embedding-3-large"
) -> Generator[SourceCodeChunk, None, None]:
    """Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
    if not code_file_part.source_code:
        logger.error(f"No source code in CodeFile {code_file_part.id}")
        return

    tokenizer = tiktoken.encoding_for_model(model_name)
    max_subchunk_tokens = max(1, int(granularity * max_tokens))
    subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens)

    previous_chunk = None
    while subchunk_token_counts:
        subchunk_token_counts, chunk_source_code = _get_chunk_source_code(subchunk_token_counts, overlap, max_tokens)
        if not chunk_source_code:
            continue
        current_chunk = SourceCodeChunk(
            id=uuid5(NAMESPACE_OID, chunk_source_code),
            code_chunk_of=code_file_part,
            source_code=chunk_source_code,
            previous_chunk=previous_chunk
        )
        yield current_chunk
        previous_chunk = current_chunk


async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \
        AsyncGenerator[list[DataPoint], None]:
    """Processes code graph data points, creating SourceCodeChunk data points."""
    # TODO: Add support for other embedding models, with max_token mapping
    for data_point in data_points:
        try:
            yield data_point
            if not isinstance(data_point, CodeFile):
                continue
            if not data_point.contains:
                logger.warning(f"CodeFile {data_point.id} contains no code parts")
                continue
            for code_part in data_point.contains:
                try:
                    yield code_part
                    for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model):
                        yield source_code_chunk
                except Exception as e:
                    logger.error(f"Error processing code part: {e}")
        except Exception as e:
            logger.error(f"Error processing data point: {e}")


@ -1,6 +1,10 @@
import logging
from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.engine import DataPoint
logger = logging.getLogger("index_data_points")
async def index_data_points(data_points: list[DataPoint]):
created_indexes = {}
@ -30,7 +34,10 @@ async def index_data_points(data_points: list[DataPoint]):
for index_name, indexable_points in index_points.items():
index_name, field_name = index_name.split(".")
await vector_engine.index_data_points(index_name, field_name, indexable_points)
try:
await vector_engine.index_data_points(index_name, field_name, indexable_points)
except EmbeddingException as e:
logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}")
return data_points

cognee/tasks/summarization/models.py

@ -1,6 +1,8 @@
from typing import Union
from cognee.infrastructure.engine import DataPoint
from cognee.modules.chunking.models import DocumentChunk
from cognee.shared.CodeGraphEntities import CodeFile
from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk
class TextSummary(DataPoint):
@ -17,7 +19,7 @@ class TextSummary(DataPoint):
class CodeSummary(DataPoint):
__tablename__ = "code_summary"
text: str
made_from: CodeFile
summarizes: Union[CodeFile, CodePart, SourceCodeChunk]
_metadata: dict = {
"index_fields": ["text"],

cognee/tasks/summarization/summarize_code.py

@ -1,10 +1,10 @@
import asyncio
from typing import AsyncGenerator, Union
from uuid import uuid5
from typing import Type
from cognee.infrastructure.engine import DataPoint
from cognee.modules.data.extraction.extract_summary import extract_code_summary
from .models import CodeSummary
@ -21,7 +21,7 @@ async def summarize_code(
)
file_summaries_map = {
code_data_point.extracted_id: str(file_summary)
code_data_point.id: str(file_summary)
for code_data_point, file_summary in zip(code_data_points, file_summaries)
}
@ -35,6 +35,6 @@ async def summarize_code(
yield CodeSummary(
id=uuid5(node.id, "CodeSummary"),
made_from=node,
text=file_summaries_map[node.extracted_id],
summarizes=node,
text=file_summaries_map[node.id],
)


@ -2,12 +2,53 @@ import os
import logging
import pathlib
import cognee
from cognee.modules.data.models import Data
from cognee.api.v1.search import SearchType
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search
from cognee.modules.users.methods import get_default_user
logging.basicConfig(level=logging.DEBUG)
async def test_local_file_deletion(data_text, file_location):
from sqlalchemy import select
import hashlib
from cognee.infrastructure.databases.relational import get_relational_engine
engine = get_relational_engine()
async with engine.get_async_session() as session:
# Get hash of data contents
encoded_text = data_text.encode("utf-8")
data_hash = hashlib.md5(encoded_text).hexdigest()
# Get data entry from database based on hash contents
data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one()
assert os.path.isfile(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}"
# Test deletion of data along with local files created by cognee
await engine.delete_data_entity(data.id)
assert not os.path.exists(
data.raw_data_location), f"Data location still exists after deletion: {data.raw_data_location}"
async with engine.get_async_session() as session:
# Get data entry from database based on file path
data = (await session.scalars(select(Data).where(Data.raw_data_location == file_location))).one()
assert os.path.isfile(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}"
# Test local files not created by cognee won't get deleted
await engine.delete_data_entity(data.id)
assert os.path.exists(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}"
async def test_getting_of_documents(dataset_name_1):
# Test getting of documents for search per dataset
from cognee.modules.users.permissions.methods import get_document_ids_for_user
user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1"
# Test getting of documents for search when no dataset is provided
user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id)
assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2"
async def main():
cognee.config.set_vector_db_config(
@ -67,16 +108,7 @@ async def main():
from cognee.infrastructure.databases.vector import get_vector_engine
# Test getting of documents for search per dataset
from cognee.modules.users.permissions.methods import get_document_ids_for_user
user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1"
# Test getting of documents for search when no dataset is provided
user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id)
assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2"
await test_getting_of_documents(dataset_name_1)
vector_engine = get_vector_engine()
random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0]
@ -106,6 +138,8 @@ async def main():
results = await brute_force_triplet_search('What is a quantum computer?')
assert len(results) > 0
await test_local_file_deletion(text, explanation_file_path)
await cognee.prune.prune_data()
assert not os.path.isdir(data_directory_path), "Local data files are not deleted"


@ -11,6 +11,6 @@ async def main(repo_path, include_docs):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
parser.add_argument("--include_docs", type=bool, default=True, help="Whether or not to process non-code files")
parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files")
args = parser.parse_args()
asyncio.run(main(args.repo_path, args.include_docs))
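The `type=lambda x: ...` change works around an argparse pitfall: with `type=bool`, any non-empty string (including `"False"`) converts to `True`, so the flag's value is silently ignored. A standalone illustration:
```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--broken", type=bool, default=True)
parser.add_argument("--fixed", type=lambda x: x.lower() in ("true", "1"), default=True)

args = parser.parse_args(["--broken", "False", "--fixed", "False"])
print(args.broken)  # True, because bool("False") is truthy
print(args.fixed)   # False, parsed as intended
```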

poetry.lock (generated)

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]]
name = "aiofiles"
@ -586,17 +586,17 @@ css = ["tinycss2 (>=1.1.0,<1.5)"]
[[package]]
name = "boto3"
version = "1.35.84"
version = "1.35.85"
description = "The AWS SDK for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "boto3-1.35.84-py3-none-any.whl", hash = "sha256:c94fc8023caf952f8740a48fc400521bba167f883cfa547d985c05fda7223f7a"},
{file = "boto3-1.35.84.tar.gz", hash = "sha256:9f9bf72d92f7fdd546b974ffa45fa6715b9af7f5c00463e9d0f6ef9c95efe0c2"},
{file = "boto3-1.35.85-py3-none-any.whl", hash = "sha256:f22678bdbdc91ca6022a45696284d236e1fbafa84ca3a69d108d4a155cdd823e"},
{file = "boto3-1.35.85.tar.gz", hash = "sha256:6257cad97d92c2b5597aec6e5484b9cfed8c0c785297942ed37cfaf2dd0ec23c"},
]
[package.dependencies]
botocore = ">=1.35.84,<1.36.0"
botocore = ">=1.35.85,<1.36.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.10.0,<0.11.0"
@ -605,13 +605,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
[[package]]
name = "botocore"
version = "1.35.84"
version = "1.35.85"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">=3.8"
files = [
{file = "botocore-1.35.84-py3-none-any.whl", hash = "sha256:b4dc2ac7f54ba959429e1debbd6c7c2fb2349baa1cd63803f0682f0773dbd077"},
{file = "botocore-1.35.84.tar.gz", hash = "sha256:f86754882e04683e2e99a6a23377d0dd7f1fc2b2242844b2381dbe4dcd639301"},
{file = "botocore-1.35.85-py3-none-any.whl", hash = "sha256:04c196905b0eebcb29f7594a9e4588772a5222deed1b381f54cab78d0f30e239"},
{file = "botocore-1.35.85.tar.gz", hash = "sha256:5e7e8075e85427c9e0e6d15dcb7d13b3c843011b25d43981571fe1bfb3fd6985"},
]
[package.dependencies]
@ -726,6 +726,17 @@ files = [
[package.dependencies]
pycparser = "*"
[[package]]
name = "cfgv"
version = "3.4.0"
description = "Validate configuration and produce human readable error messages."
optional = false
python-versions = ">=3.8"
files = [
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
]
[[package]]
name = "chardet"
version = "5.2.0"
@ -1248,13 +1259,13 @@ optimize = ["orjson"]
[[package]]
name = "deepeval"
version = "2.0.6"
description = "The open-source LLMs evaluation framework."
version = "2.0.8"
description = "The Open-Source LLM Evaluation Framework."
optional = true
python-versions = "<3.13,>=3.9"
files = [
{file = "deepeval-2.0.6-py3-none-any.whl", hash = "sha256:57302830ff9d3d16ad4f1961338c7b4453e48039ff131990f258880728f33b6b"},
{file = "deepeval-2.0.6.tar.gz", hash = "sha256:74976e01f4896a18169354a766a779b99df3d3386f88d9a9e49862fa93989dd5"},
{file = "deepeval-2.0.8-py3-none-any.whl", hash = "sha256:a947f7440f168e734b3b433b6b2c56c512757cfcc403aba99c6393a45a15f776"},
{file = "deepeval-2.0.8.tar.gz", hash = "sha256:a0222f93f9a50d51b9962d88d16c2fc315e8223b756355e65eb9a1d5e1a8ae40"},
]
[package.dependencies]
@ -1380,6 +1391,17 @@ files = [
{file = "dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd"},
]
[[package]]
name = "distlib"
version = "0.3.9"
description = "Distribution utilities"
optional = false
python-versions = "*"
files = [
{file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"},
{file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"},
]
[[package]]
name = "distro"
version = "1.9.0"
@ -2529,6 +2551,20 @@ files = [
{file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
]
[[package]]
name = "identify"
version = "2.6.3"
description = "File identification library for Python"
optional = false
python-versions = ">=3.9"
files = [
{file = "identify-2.6.3-py2.py3-none-any.whl", hash = "sha256:9edba65473324c2ea9684b1f944fe3191db3345e50b6d04571d10ed164f8d7bd"},
{file = "identify-2.6.3.tar.gz", hash = "sha256:62f5dae9b5fef52c84cc188514e9ea4f3f636b1d8799ab5ebc475471f9e47a02"},
]
[package.extras]
license = ["ukkonen"]
[[package]]
name = "idna"
version = "3.10"
@ -3055,13 +3091,13 @@ jupyter-server = ">=1.1.2"
[[package]]
name = "jupyter-server"
version = "2.14.2"
version = "2.15.0"
description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications."
optional = true
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "jupyter_server-2.14.2-py3-none-any.whl", hash = "sha256:47ff506127c2f7851a17bf4713434208fc490955d0e8632e95014a9a9afbeefd"},
{file = "jupyter_server-2.14.2.tar.gz", hash = "sha256:66095021aa9638ced276c248b1d81862e4c50f292d575920bbe960de1c56b12b"},
{file = "jupyter_server-2.15.0-py3-none-any.whl", hash = "sha256:872d989becf83517012ee669f09604aa4a28097c0bd90b2f424310156c2cdae3"},
{file = "jupyter_server-2.15.0.tar.gz", hash = "sha256:9d446b8697b4f7337a1b7cdcac40778babdd93ba614b6d68ab1c0c918f1c4084"},
]
[package.dependencies]
@ -3070,7 +3106,7 @@ argon2-cffi = ">=21.1"
jinja2 = ">=3.0.3"
jupyter-client = ">=7.4.4"
jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0"
jupyter-events = ">=0.9.0"
jupyter-events = ">=0.11.0"
jupyter-server-terminals = ">=0.4.4"
nbconvert = ">=6.4.4"
nbformat = ">=5.3.0"
@ -3388,13 +3424,13 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10"
[[package]]
name = "langchain-core"
version = "0.3.27"
version = "0.3.28"
description = "Building applications with LLMs through composability"
optional = true
python-versions = "<4.0,>=3.9"
files = [
{file = "langchain_core-0.3.27-py3-none-any.whl", hash = "sha256:5db42cd73f4f2d5d2550403a4c8b7dec2cd56f1501fe207d009b5b9632a5ca02"},
{file = "langchain_core-0.3.27.tar.gz", hash = "sha256:3d71835c4c630db03a19757565046029e89ee96a4b06788892599d49bb3fc58a"},
{file = "langchain_core-0.3.28-py3-none-any.whl", hash = "sha256:a02f81ca53a8eed757133797e5a602ca80c1324bbecb0c5d86ef7bd3d6625372"},
{file = "langchain_core-0.3.28.tar.gz", hash = "sha256:407f7607e6b3c0ebfd6094da95d39b701e22e59966698ef126799782953e7f2c"},
]
[package.dependencies]
@ -4537,6 +4573,17 @@ plot = ["matplotlib"]
tgrep = ["pyparsing"]
twitter = ["twython"]
[[package]]
name = "nodeenv"
version = "1.9.1"
description = "Node.js virtual environment builder"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
{file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
{file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
]
[[package]]
name = "notebook"
version = "7.3.1"
@ -4964,8 +5011,8 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.20.3", markers = "python_version < \"3.10\""},
{version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version >= \"3.11\""},
{version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -5350,6 +5397,24 @@ dev = ["black", "flake8", "flake8-print", "isort", "pre-commit"]
sentry = ["django", "sentry-sdk"]
test = ["coverage", "django", "flake8", "freezegun (==0.3.15)", "mock (>=2.0.0)", "pylint", "pytest", "pytest-timeout"]
[[package]]
name = "pre-commit"
version = "4.0.1"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
optional = false
python-versions = ">=3.9"
files = [
{file = "pre_commit-4.0.1-py2.py3-none-any.whl", hash = "sha256:efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878"},
{file = "pre_commit-4.0.1.tar.gz", hash = "sha256:80905ac375958c0444c65e9cebebd948b3cdb518f335a091a670a89d652139d2"},
]
[package.dependencies]
cfgv = ">=2.0.0"
identify = ">=1.0.0"
nodeenv = ">=0.11.1"
pyyaml = ">=5.1"
virtualenv = ">=20.10.0"
[[package]]
name = "prometheus-client"
version = "0.21.1"
@ -5491,32 +5556,32 @@ files = [
[[package]]
name = "psutil"
version = "6.1.0"
version = "6.1.1"
description = "Cross-platform lib for process and system monitoring in Python."
optional = true
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
files = [
{file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"},
{file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"},
{file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"},
{file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"},
{file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"},
{file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"},
{file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"},
{file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"},
{file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"},
{file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"},
{file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"},
{file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"},
{file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"},
{file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"},
{file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"},
{file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"},
{file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"},
{file = "psutil-6.1.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8"},
{file = "psutil-6.1.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777"},
{file = "psutil-6.1.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4"},
{file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468"},
{file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca"},
{file = "psutil-6.1.1-cp27-none-win32.whl", hash = "sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac"},
{file = "psutil-6.1.1-cp27-none-win_amd64.whl", hash = "sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030"},
{file = "psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8"},
{file = "psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377"},
{file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003"},
{file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160"},
{file = "psutil-6.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3"},
{file = "psutil-6.1.1-cp36-cp36m-win32.whl", hash = "sha256:384636b1a64b47814437d1173be1427a7c83681b17a450bfc309a1953e329603"},
{file = "psutil-6.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8be07491f6ebe1a693f17d4f11e69d0dc1811fa082736500f649f79df7735303"},
{file = "psutil-6.1.1-cp37-abi3-win32.whl", hash = "sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53"},
{file = "psutil-6.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649"},
{file = "psutil-6.1.1.tar.gz", hash = "sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5"},
]
[package.extras]
dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"]
dev = ["abi3audit", "black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"]
test = ["pytest", "pytest-xdist", "setuptools"]
[[package]]
@ -8290,6 +8355,26 @@ files = [
[package.extras]
crypto-eth-addresses = ["eth-hash[pycryptodome] (>=0.7.0)"]
[[package]]
name = "virtualenv"
version = "20.28.0"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.8"
files = [
{file = "virtualenv-20.28.0-py3-none-any.whl", hash = "sha256:23eae1b4516ecd610481eda647f3a7c09aea295055337331bb4e6892ecce47b0"},
{file = "virtualenv-20.28.0.tar.gz", hash = "sha256:2c9c3262bb8e7b87ea801d715fae4495e6032450c71d2309be9550e7364049aa"},
]
[package.dependencies]
distlib = ">=0.3.7,<1"
filelock = ">=3.12.2,<4"
platformdirs = ">=3.9.1,<5"
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
[[package]]
name = "watchdog"
version = "6.0.0"
@ -8792,4 +8877,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9.0,<3.12"
content-hash = "ae111e1b3e0cf0dfdd1b0d124ce265b600348adeca512a5e1d34ebd22066496c"
content-hash = "042e6fedc069e74ca4ea98cf4614bec99be59a90a29340520d189bf150fd2194"

pyproject.toml

@ -74,6 +74,7 @@ deepeval = {version = "^2.0.1", optional = true}
transformers = "^4.46.3"
pymilvus = {version = "^2.5.0", optional = true}
unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.10", optional = true }
pre-commit = "^4.0.1"
httpx = "0.27.0"
@ -115,11 +116,10 @@ mkdocstrings = {extras = ["python"], version = "^0.26.2"}
[tool.ruff] # https://beta.ruff.rs/docs/
line-length = 100
[tool.ruff.lint]
ignore = ["F401"]
ignore-init-module-imports = true
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"