Secure API v2 (#1050)

<!-- .github/pull_request_template.md -->

## Description
Modify endpoints to allow for stricter security across different infrastructure
needs and setups: the delete flow now verifies dataset-level permissions (by
`dataset_id` or `dataset_name`), the environment template is reorganized around the
security-related flags (`ACCEPT_LOCAL_FILE_PATH`, `ALLOW_HTTP_REQUESTS`,
`ENABLE_BACKEND_ACCESS_CONTROL`), and the pickle serialization helpers are removed
from `DataPoint`.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Igor Ilic 2025-07-07 20:41:43 +02:00 committed by GitHub
parent 3c3c89a140
commit cb45897d7d
6 changed files with 211 additions and 202 deletions

View file

@ -1,189 +1,112 @@
###############################################################################
# NOTE: With default settings Cognee only needs an OpenAI LLM_API_KEY to be set.
# The rest of the settings don't have to be set.
# Default relational database: SQLite
# Default vector database : LanceDB
# Default graph database : Kuzu
#
# These default databases are all file-based, so no extra setup is needed
# for local use.
###############################################################################
###
### DEV
###
TOKENIZERS_PARALLELISM="false"
###
### LLM
###
###
### simple, "expensive", an OpenAI key
###
################################################################################
# 🧠 LLM Settings
################################################################################
LLM_API_KEY="your_api_key"
###
### DEV LLM, cheap with content filters
###
LLM_MODEL="azure/gpt-4o-mini"
LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
LLM_API_KEY="<<TALK TO YOUR AZURE GUY"
LLM_API_VERSION="2024-12-01-preview"
#llm api version might not be relevant
LLM_MODEL="openai/gpt-4o-mini"
LLM_PROVIDER="openai"
LLM_ENDPOINT=""
LLM_API_VERSION=""
LLM_MAX_TOKENS="16384"
EMBEDDING_MODEL="azure/text-embedding-3-large"
EMBEDDING_ENDPOINT="https://DNS.openai.azure.com/openai/deployments/text-embedding-3-large"
EMBEDDING_API_KEY="<<TALK TO YOUR AZURE GUY>"
EMBEDDING_API_VERSION="2024-12-01-preview"
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191
###
### free local LLM, install it
###
LLM_API_KEY = "ollama"
LLM_MODEL = "llama3.1:8b"
LLM_PROVIDER = "ollama"
LLM_ENDPOINT = "http://localhost:11434/v1"
EMBEDDING_PROVIDER = "ollama"
EMBEDDING_MODEL = "avr/sfr-embedding-mistral:latest"
EMBEDDING_ENDPOINT = "http://localhost:11434/api/embeddings"
EMBEDDING_DIMENSIONS = 4096
HUGGINGFACE_TOKENIZER = "Salesforce/SFR-Embedding-Mistral"
###
### openrouter, also free
###
LLM_API_KEY="<<go-get-one-yourself"
LLM_PROVIDER="custom"
LLM_MODEL="openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
LLM_ENDPOINT="https://openrouter.ai/api/v1"
###
### deepinfra
###
LLM_API_KEY="<<>>"
LLM_PROVIDER="custom"
LLM_MODEL="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct"
LLM_ENDPOINT="https://api.deepinfra.com/v1/openai"
EMBEDDING_PROVIDER="openai"
EMBEDDING_API_KEY="<<>>"
EMBEDDING_MODEL="deepinfra/BAAI/bge-base-en-v1.5"
EMBEDDING_MODEL="openai/text-embedding-3-large"
EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191
# If embedding key is not provided same key set for LLM_API_KEY will be used
#EMBEDDING_API_KEY="your_api_key"
###
### DB
###
###
### db minimal/default
###
GRAPH_DATABASE_PROVIDER="networkx"
VECTOR_DB_PROVIDER="lancedb"
DB_PROVIDER=sqlite
DB_NAME=cognee_db
###
### Relational options
###
################################################################################
# 🗄️ Relational database settings
################################################################################
DB_PROVIDER="sqlite"
DB_NAME=cognee_db
DB_PROVIDER=postgres
DB_NAME=cognee_db
DB_HOST=127.0.0.1
DB_PORT=5432
DB_USERNAME=cognee
DB_PASSWORD=cognee
# -- To switch to Postgres / PGVector, uncomment and fill these: -------------
#DB_PROVIDER=postgres
#DB_NAME=cognee_db
# To use Postgres with the Cognee backend in Docker compose use the following instead: DB_HOST=host.docker.internal
#DB_HOST=127.0.0.1
#DB_PORT=5432
#DB_USERNAME=cognee
#DB_PASSWORD=cognee
###
### Graph options
###
#Default
################################################################################
# 🕸️ Graph Database settings
################################################################################
# Default (local file-based)
GRAPH_DATABASE_PROVIDER="kuzu"
#or if using remote
# -- To switch to Remote Kuzu uncomment and fill these: -------------------------------------------------------------
#GRAPH_DATABASE_PROVIDER="kuzu"
#GRAPH_DATABASE_PROVIDER="kuzu-remote"
#GRAPH_DATABASE_URL="http://localhost:8000"
#GRAPH_DATABASE_USERNAME=XXX
#GRAPH_DATABASE_PASSWORD=YYY
GRAPH_DATABASE_PROVIDER="kuzu"
GRAPH_DATABASE_PROVIDER="kuzu-remote"
GRAPH_DATABASE_URL="http://localhost:8000"
GRAPH_DATABASE_USERNAME=XXX
GRAPH_DATABASE_PASSWORD=YYY
# -- To switch to Neo4j uncomment and fill these: -------------------------------------------------------------------
#GRAPH_DATABASE_PROVIDER="neo4j"
#GRAPH_DATABASE_URL=bolt://localhost:7687
#GRAPH_DATABASE_USERNAME=neo4j
#GRAPH_DATABASE_PASSWORD=localneo4j
# or if using neo4j
GRAPH_DATABASE_PROVIDER="neo4j"
GRAPH_DATABASE_URL=bolt://localhost:7687
GRAPH_DATABASE_USERNAME=neo4j
GRAPH_DATABASE_PASSWORD=localneo4j
###
### Vector options
###
################################################################################
# 📐 Vector Database settings
################################################################################
# Supported providers: pgvector | qdrant | weaviate | milvus | lancedb | chromadb
VECTOR_DB_PROVIDER="lancedb"
# Only needed when a hosted/cloud vector database is used
VECTOR_DB_URL=
VECTOR_DB_KEY=
VECTOR_DB_PROVIDER="pgvector"
###
### for release test
###
LLM_API_KEY="..."
OPENAI_API_KEY="..."
MIGRATION_DB_PATH="~/Downloads/"
MIGRATION_DB_NAME="Chinook_Sqlite.sqlite"
MIGRATION_DB_PROVIDER="sqlite"
GRAPH_DATABASE_URL="bolt://54.246.89.112:7687"
GRAPH_DATABASE_USERNAME="neo4j"
GRAPH_DATABASE_PASSWORD="pleaseletmein"
###
### ROOT DIRECTORY IF USING COGNEE LIB INSIDE A DOCKER
###
################################################################################
# 📂 ROOT DIRECTORIES WHEN USING THE COGNEE LIB INSIDE A DOCKER CONTAINER
################################################################################
# Set up the Cognee system directory. Cognee will store system files and databases here.
DATA_ROOT_DIRECTORY='/cognee_data/data'
SYSTEM_ROOT_DIRECTORY='/cognee_data/system'
################################################################################
# 🔄 MIGRATION (RELATIONAL → GRAPH) SETTINGS
################################################################################
# Postgres-specific parameters (only needed if Postgres or PGVector is used). Not needed for Cognee's default SQLite-NetworkX-LanceDB setup.
# DB_USERNAME=cognee
# DB_PASSWORD=cognee
# To use Postgres with the Cognee backend in Docker compose use the following instead: DB_HOST=host.docker.internal
# DB_HOST=127.0.0.1
# DB_PORT=5432
MIGRATION_DB_PATH="/path/to/migration/directory"
MIGRATION_DB_NAME="migration_database.sqlite"
MIGRATION_DB_PROVIDER="sqlite"
# Params for migrating relational database data to graph / Cognee ( PostgreSQL and SQLite supported )
# MIGRATION_DB_PATH="/path/to/migration/directory"
# MIGRATION_DB_NAME="migration_database.sqlite"
# MIGRATION_DB_PROVIDER="sqlite"
# Postgres specific parameters for migration
# -- Postgres-specific migration params --------------------------------------
# MIGRATION_DB_USERNAME=cognee
# MIGRATION_DB_PASSWORD=cognee
# MIGRATION_DB_HOST="127.0.0.1"
# MIGRATION_DB_PORT=5432
# LITELLM Logging Level. Set to quiet down logging
LITELLM_LOG="ERROR"
################################################################################
# 🔒 Security Settings
################################################################################
# Set this environment variable to disable sending telemetry data
# TELEMETRY_DISABLED=1
# When set to False, Cognee will not allow adding local system file paths. Set this to False when Cognee is used as a backend.
ACCEPT_LOCAL_FILE_PATH=True
# When set to False, Cognee will not send outbound HTTP requests.
# This protects against Server-Side Request Forgery (SSRF) when proper infrastructure safeguards are not in place.
ALLOW_HTTP_REQUESTS=True
# Set this variable to True to enforce usage of backend access control for Cognee
# Note: This is only currently supported by the following databases:
@ -194,3 +117,94 @@ LITELLM_LOG="ERROR"
# It enforces the use of LanceDB and KuzuDB and creates separate databases per Cognee user + dataset
ENABLE_BACKEND_ACCESS_CONTROL=False
################################################################################
# 🛠️ DEV Settings
################################################################################
ENV="local"
TOKENIZERS_PARALLELISM="false"
# LITELLM Logging Level. Set to quiet down logging
LITELLM_LOG="ERROR"
# Set this environment variable to disable sending telemetry data
# TELEMETRY_DISABLED=1
# Default User Configuration
# DEFAULT_USER_EMAIL=""
# DEFAULT_USER_PASSWORD=""
------------------------------- END OF POSSIBLE SETTINGS -------------------------------
###############################################################################
# 🧪 EXAMPLE OVERRIDES (commented out)
###############################################################################
# The blocks below show how to configure alternative providers.
# Uncomment + fill values to switch.
########## Azure OpenAI #######################################################
#LLM_MODEL="azure/gpt-4o-mini"
#LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
#LLM_API_KEY="<<TALK TO YOUR AZURE GUY"
#LLM_API_VERSION="2024-12-01-preview"
## llm api version might not be relevant
#LLM_MAX_TOKENS="16384"
#EMBEDDING_MODEL="azure/text-embedding-3-large"
#EMBEDDING_ENDPOINT="https://DNS.openai.azure.com/openai/deployments/text-embedding-3-large"
#EMBEDDING_API_KEY="<<TALK TO YOUR AZURE GUY>"
#EMBEDDING_API_VERSION="2024-12-01-preview"
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191
########## Local LLM via Ollama ###############################################
#LLM_API_KEY ="ollama"
#LLM_MODEL="llama3.1:8b"
#LLM_PROVIDER="ollama"
#LLM_ENDPOINT="http://localhost:11434/v1"
#EMBEDDING_PROVIDER="ollama"
#EMBEDDING_MODEL="avr/sfr-embedding-mistral:latest"
#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
#EMBEDDING_DIMENSIONS=4096
#HUGGINGFACE_TOKENIZER="Salesforce/SFR-Embedding-Mistral"
########## OpenRouter (also free) #########################################################
#LLM_API_KEY="<<go-get-one-yourself"
#LLM_PROVIDER="custom"
#LLM_MODEL="openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
#LLM_ENDPOINT="https://openrouter.ai/api/v1"
########## DeepInfra ##########################################################
#LLM_API_KEY="<<>>"
#LLM_PROVIDER="custom"
#LLM_MODEL="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct"
#LLM_ENDPOINT="https://api.deepinfra.com/v1/openai"
#EMBEDDING_PROVIDER="openai"
#EMBEDDING_API_KEY="<<>>"
#EMBEDDING_MODEL="deepinfra/BAAI/bge-base-en-v1.5"
#EMBEDDING_ENDPOINT=""
#EMBEDDING_API_VERSION=""
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191
########## Release Test ###############################################
#LLM_API_KEY="..."
#OPENAI_API_KEY="..."
#MIGRATION_DB_PATH="~/Downloads/"
#MIGRATION_DB_NAME="Chinook_Sqlite.sqlite"
#MIGRATION_DB_PROVIDER="sqlite"
#GRAPH_DATABASE_URL="bolt://54.246.89.112:7687"
#GRAPH_DATABASE_USERNAME="neo4j"
#GRAPH_DATABASE_PASSWORD="pleaseletmein"
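
The Security Settings block above documents three deployment-hardening flags. As a rough, hypothetical illustration (not part of this diff, and not Cognee's actual API), a deployment script could sanity-check them before running Cognee as a backend service:

```python
import os


def _flag(name: str, default: str) -> bool:
    """Parse a boolean environment variable the way typical .env loaders do."""
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")


def backend_hardening_warnings() -> list:
    """Return warnings if the deployment keeps the permissive defaults."""
    warnings = []
    if _flag("ACCEPT_LOCAL_FILE_PATH", "True"):
        warnings.append("ACCEPT_LOCAL_FILE_PATH should be False when Cognee is used as a backend.")
    if _flag("ALLOW_HTTP_REQUESTS", "True"):
        warnings.append("ALLOW_HTTP_REQUESTS=True relies on your infrastructure for SSRF protection.")
    if not _flag("ENABLE_BACKEND_ACCESS_CONTROL", "False"):
        warnings.append("ENABLE_BACKEND_ACCESS_CONTROL=False disables per-user/per-dataset isolation.")
    return warnings


if __name__ == "__main__":
    for warning in backend_hardening_warnings():
        print(warning)
```

The suggested values simply mirror the comments in the template: disable local file paths and outbound HTTP, and enable backend access control, when Cognee is exposed as a backend.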

View file

@ -31,6 +31,7 @@ def get_add_router() -> APIRouter:
raise ValueError("Either datasetId or datasetName must be provided.")
try:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
if isinstance(data, str) and data.startswith("http"):
if "github" in data:
# Perform git clone if the URL is from GitHub
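
The TODO above points at gating remote ingestion on the new ALLOW_HTTP_REQUESTS flag. A hypothetical shape for that guard (the helper names and the raw environment lookup are assumptions, not Cognee's configuration API):

```python
import os


def _http_requests_allowed() -> bool:
    # Assumption: the flag is read straight from the environment; Cognee may expose
    # a dedicated settings accessor instead.
    return os.getenv("ALLOW_HTTP_REQUESTS", "true").strip().lower() in ("1", "true", "yes")


def guard_remote_source(data: str) -> None:
    """Refuse URLs (and therefore git clone) when outbound HTTP is disabled."""
    if data.startswith("http") and not _http_requests_allowed():
        raise PermissionError("Remote ingestion is disabled via ALLOW_HTTP_REQUESTS.")
```

Called just before the `data.startswith("http")` branch above, this would make the add endpoint honor the same flag the .env template documents.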

View file

@ -5,13 +5,16 @@ from sqlalchemy import select
from sqlalchemy.sql import delete as sql_delete
from cognee.modules.data.models import Data, DatasetData, Dataset
from cognee.infrastructure.databases.graph import get_graph_engine
from io import StringIO, BytesIO
from io import BytesIO
import hashlib
import asyncio
from uuid import UUID
from cognee.modules.users.models import User
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.engine import DataPoint
from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
from cognee.modules.users.methods import get_default_user
from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.context_global_variables import set_database_global_context_variables
from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
from cognee.shared.logging_utils import get_logger
@ -26,7 +29,9 @@ def get_text_content_hash(text: str) -> str:
async def delete(
data: Union[BinaryIO, List[BinaryIO], str, List[str]],
dataset_name: str = "main_dataset",
dataset_id: UUID = None,
mode: str = "soft",
user: User = None,
):
"""Delete a document and all its related nodes from both relational and graph databases.
@ -34,15 +39,27 @@ async def delete(
data: The data to delete (file, URL, or text)
dataset_name: Name of the dataset to delete from
mode: "soft" (default) or "hard" - hard mode also deletes degree-one entity nodes
user: User performing the operation; if None, the default user will be used.
"""
if user is None:
user = await get_default_user()
# Verify user has permission to work with given dataset. If dataset_id is given use it, if not use dataset_name
dataset = await get_authorized_existing_datasets(
[dataset_id] if dataset_id else [dataset_name], "delete", user
)
# Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
await set_database_global_context_variables(dataset[0].id, dataset[0].owner_id)
# Handle different input types
if isinstance(data, str):
if data.startswith("file://"): # It's a file path
with open(data.replace("file://", ""), mode="rb") as file:
classified_data = classify(file)
content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
elif data.startswith("http"): # It's a URL
import requests
@ -51,26 +68,26 @@ async def delete(
file_data = BytesIO(response.content)
classified_data = classify(file_data)
content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
else: # It's a text string
content_hash = get_text_content_hash(data)
classified_data = classify(data)
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
elif isinstance(data, list):
# Handle list of inputs sequentially
results = []
for item in data:
result = await delete(item, dataset_name, mode)
result = await delete(item, dataset_name, dataset[0].id, mode)
results.append(result)
return {"status": "success", "message": "Multiple documents deleted", "results": results}
else: # It's already a BinaryIO
data.seek(0) # Ensure we're at the start of the file
classified_data = classify(data)
content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
async def delete_single_document(content_hash: str, dataset_name: str, mode: str = "soft"):
async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
"""Delete a single document by its content hash."""
# Delete from graph database
@ -157,11 +174,11 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
# Get the dataset
dataset = (
await session.execute(select(Dataset).filter(Dataset.name == dataset_name))
await session.execute(select(Dataset).filter(Dataset.id == dataset_id))
).scalar_one_or_none()
if dataset is None:
raise DatasetNotFoundError(f"Dataset not found: {dataset_name}")
raise DatasetNotFoundError(f"Dataset not found: {dataset_id}")
# Delete from dataset_data table
dataset_delete_stmt = sql_delete(DatasetData).where(
@ -186,7 +203,7 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
"message": "Document deleted from both graph and relational databases",
"graph_deletions": deletion_result["deleted_counts"],
"content_hash": content_hash,
"dataset": dataset_name,
"dataset": dataset_id,
"deleted_node_ids": [
str(node_id) for node_id in deleted_node_ids
], # Convert back to strings for response
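
To illustrate the updated signature, a minimal usage sketch; the import path, file path, and UUID are placeholders for the example and not taken from this PR:

```python
import asyncio
from uuid import UUID

from cognee import delete  # assumption: adjust if the function lives under a different module path


async def main():
    # dataset_id now takes precedence over dataset_name for the permission check,
    # and the default user is resolved when no user is passed.
    result = await delete(
        "file:///data/example_report.pdf",  # example path
        dataset_id=UUID("00000000-0000-0000-0000-000000000000"),  # example UUID
        mode="soft",
    )
    print(result)


asyncio.run(main())
```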

View file

@ -1,7 +1,8 @@
from fastapi import Form, UploadFile, Depends
from fastapi.responses import JSONResponse
from fastapi import APIRouter
from typing import List, Optional
from typing import List
from uuid import UUID
import subprocess
from cognee.shared.logging_utils import get_logger
import requests
@ -18,6 +19,7 @@ def get_delete_router() -> APIRouter:
async def delete(
data: List[UploadFile],
dataset_name: str = Form("main_dataset"),
dataset_id: UUID = None,
mode: str = Form("soft"),
user: User = Depends(get_authenticated_user),
):
@ -35,6 +37,7 @@ def get_delete_router() -> APIRouter:
# Handle each file in the list
results = []
for file in data:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
if file.filename.startswith("http"):
if "github" in file.filename:
# For GitHub repos, we need to get the content hash of each file
@ -54,12 +57,22 @@ def get_delete_router() -> APIRouter:
response.raise_for_status()
file_data = response.content
result = await cognee_delete(
file_data, dataset_name=dataset_name, mode=mode
file_data,
dataset_name=dataset_name,
dataset_id=dataset_id,
mode=mode,
user=user,
)
results.append(result)
else:
# Handle uploaded file by accessing its file attribute
result = await cognee_delete(file.file, dataset_name=dataset_name, mode=mode)
result = await cognee_delete(
file.file,
dataset_name=dataset_name,
dataset_id=dataset_id,
mode=mode,
user=user,
)
results.append(result)
if len(results) == 1:
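
For completeness, a hypothetical client-side call against the updated router. The mount path, HTTP method, and auth header are assumptions (the route decorator is outside this excerpt); only the parameter names mirror the signature above. Since `dataset_id` is not declared with `Form(...)`, FastAPI reads it from the query string:

```python
import requests

# Hypothetical endpoint URL and auth scheme; only the field names come from the diff above.
with open("example_report.pdf", "rb") as f:
    response = requests.delete(
        "http://localhost:8000/api/v1/delete",  # assumed mount path
        params={"dataset_id": "00000000-0000-0000-0000-000000000000"},  # query parameter (no Form())
        data={"dataset_name": "main_dataset", "mode": "soft"},  # Form fields
        files=[("data", ("example_report.pdf", f, "application/pdf"))],  # List[UploadFile]
        headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
    )
response.raise_for_status()
print(response.json())
```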

View file

@ -176,43 +176,6 @@ class DataPoint(BaseModel):
"""
return self.model_validate_json(json_str)
# Pickle Serialization
def to_pickle(self) -> bytes:
"""
Serialize the DataPoint instance to a byte format for pickling.
This method uses the built-in Python pickle module to convert the instance into a byte
stream for persistence or transmission.
Returns:
--------
- bytes: The pickled byte representation of the DataPoint instance.
"""
return pickle.dumps(self.dict())
@classmethod
def from_pickle(self, pickled_data: bytes):
"""
Deserialize a DataPoint instance from a pickled byte stream.
The method converts the byte stream back into a DataPoint instance by loading the data
and validating it through the model's constructor.
Parameters:
-----------
- pickled_data (bytes): The bytes representation of a pickled DataPoint instance to
be deserialized.
Returns:
--------
A new DataPoint instance created from the pickled data.
"""
data = pickle.loads(pickled_data)
return self(**data)
def to_dict(self, **kwargs) -> Dict[str, Any]:
"""
Convert the DataPoint instance to a dictionary representation.

View file

@ -20,6 +20,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], datase
file_path = data_item
# data is a file path
elif data_item.startswith("file://") or data_item.startswith("/"):
# TODO: Add check if ACCEPT_LOCAL_FILE_PATH is enabled, if it's not raise an error
file_path = data_item.replace("file://", "")
# data is text
else:
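
The TODO above could be resolved with a guard analogous to the HTTP one; again a hypothetical sketch (the direct environment lookup is an assumption, not Cognee's settings API):

```python
import os


def _local_file_paths_accepted() -> bool:
    # Assumption: read the flag straight from the environment.
    return os.getenv("ACCEPT_LOCAL_FILE_PATH", "true").strip().lower() in ("1", "true", "yes")


def guard_local_path(data_item: str) -> None:
    """Reject raw file system paths when Cognee runs as a backend and must not read host files."""
    if (data_item.startswith("file://") or data_item.startswith("/")) and not _local_file_paths_accepted():
        raise ValueError("Local file paths are disabled via ACCEPT_LOCAL_FILE_PATH.")
```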