Secure API v2 (#1050)

<!-- .github/pull_request_template.md -->

## Description
Modify endpoints to improve security for different infrastructure needs and setups:
- restructure `.env.template` and add a dedicated Security section with the `ACCEPT_LOCAL_FILE_PATH`, `ALLOW_HTTP_REQUESTS`, and `ENABLE_BACKEND_ACCESS_CONTROL` flags
- thread `dataset_id` and `user` through the delete flow so deletions are authorized against the dataset before anything is removed
- drop pickle-based serialization from `DataPoint`

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Commit cb45897d7d (parent 3c3c89a140) · Igor Ilic · 2025-07-07 20:41:43 +02:00 · committed by GitHub
6 changed files with 211 additions and 202 deletions

View file

@@ -1,189 +1,112 @@
###############################################################################
# NOTE: With default settings Cognee only needs an OpenAI LLM_API_KEY to be set.
# The rest of the settings don't have to be set.
# Default relational database: SQLite
# Default vector database : LanceDB
# Default graph database : Kuzu
#
# These default databases are all file-based, so no extra setup is needed
# for local use.
###############################################################################

################################################################################
# 🧠 LLM Settings
################################################################################
LLM_API_KEY="your_api_key"
LLM_MODEL="openai/gpt-4o-mini"
LLM_PROVIDER="openai"
LLM_ENDPOINT=""
LLM_API_VERSION=""
LLM_MAX_TOKENS="16384"

EMBEDDING_PROVIDER="openai"
EMBEDDING_MODEL="openai/text-embedding-3-large"
EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191
# If embedding key is not provided same key set for LLM_API_KEY will be used
#EMBEDDING_API_KEY="your_api_key"

################################################################################
# 🗄️ Relational database settings
################################################################################
DB_PROVIDER="sqlite"
DB_NAME=cognee_db

# -- To switch to Postgres / PGVector, uncomment and fill these: --------------
#DB_PROVIDER=postgres
#DB_NAME=cognee_db
# To use Postgres with the Cognee backend in Docker compose use the following instead: DB_HOST=host.docker.internal
#DB_HOST=127.0.0.1
#DB_PORT=5432
#DB_USERNAME=cognee
#DB_PASSWORD=cognee

################################################################################
# 🕸️ Graph Database settings
################################################################################
# Default (local file-based)
GRAPH_DATABASE_PROVIDER="kuzu"

# -- To switch to Remote Kuzu uncomment and fill these: -----------------------
#GRAPH_DATABASE_PROVIDER="kuzu-remote"
#GRAPH_DATABASE_URL="http://localhost:8000"
#GRAPH_DATABASE_USERNAME=XXX
#GRAPH_DATABASE_PASSWORD=YYY

# -- To switch to Neo4j uncomment and fill these: -----------------------------
#GRAPH_DATABASE_PROVIDER="neo4j"
#GRAPH_DATABASE_URL=bolt://localhost:7687
#GRAPH_DATABASE_USERNAME=neo4j
#GRAPH_DATABASE_PASSWORD=localneo4j

################################################################################
# 📐 Vector Database settings
################################################################################
# Supported providers: pgvector | qdrant | weaviate | milvus | lancedb | chromadb
VECTOR_DB_PROVIDER="lancedb"
# Not needed if a cloud vector database is not used
VECTOR_DB_URL=
VECTOR_DB_KEY=

################################################################################
# 📂 ROOT DIRECTORY IF USING COGNEE LIB INSIDE A DOCKER
################################################################################
# Set up the Cognee system directory. Cognee will store system files and databases here.
DATA_ROOT_DIRECTORY='/cognee_data/data'
SYSTEM_ROOT_DIRECTORY='/cognee_data/system'

################################################################################
# 🔄 MIGRATION (RELATIONAL → GRAPH) SETTINGS
################################################################################
MIGRATION_DB_PATH="/path/to/migration/directory"
MIGRATION_DB_NAME="migration_database.sqlite"
MIGRATION_DB_PROVIDER="sqlite"
# -- Postgres-specific migration params ---------------------------------------
# MIGRATION_DB_USERNAME=cognee
# MIGRATION_DB_PASSWORD=cognee
# MIGRATION_DB_HOST="127.0.0.1"
# MIGRATION_DB_PORT=5432

################################################################################
# 🔒 Security Settings
################################################################################
# When set to false don't allow adding of local system files to Cognee. Should be set to False when Cognee is used as a backend.
ACCEPT_LOCAL_FILE_PATH=True
# When set to false don't allow HTTP requests to be sent from Cognee.
# This protects against Server Side Request Forgery when proper infrastructure is not in place.
ALLOW_HTTP_REQUESTS=True
# Set this variable to True to enforce usage of backend access control for Cognee
# Note: This is only currently supported by the following databases:

@@ -194,3 +117,94 @@ LITELLM_LOG="ERROR"
# It enforces LanceDB and KuzuDB use and uses them to create databases per Cognee user + dataset
ENABLE_BACKEND_ACCESS_CONTROL=False

################################################################################
# 🛠️ DEV Settings
################################################################################
ENV="local"
TOKENIZERS_PARALLELISM="false"
# LITELLM Logging Level. Set to quiet down logging
LITELLM_LOG="ERROR"
# Set this environment variable to disable sending telemetry data
# TELEMETRY_DISABLED=1
# Default User Configuration
# DEFAULT_USER_EMAIL=""
# DEFAULT_USER_PASSWORD=""

------------------------------- END OF POSSIBLE SETTINGS -------------------------------

###############################################################################
# 🧪 EXAMPLE OVERRIDES (commented out)
###############################################################################
# The blocks below show how to configure alternative providers.
# Uncomment + fill values to switch.

########## Azure OpenAI #######################################################
#LLM_MODEL="azure/gpt-4o-mini"
#LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
#LLM_API_KEY="<<TALK TO YOUR AZURE GUY>>"
#LLM_API_VERSION="2024-12-01-preview"
## llm api version might not be relevant
#LLM_MAX_TOKENS="16384"
#EMBEDDING_MODEL="azure/text-embedding-3-large"
#EMBEDDING_ENDPOINT="https://DNS.openai.azure.com/openai/deployments/text-embedding-3-large"
#EMBEDDING_API_KEY="<<TALK TO YOUR AZURE GUY>>"
#EMBEDDING_API_VERSION="2024-12-01-preview"
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191

########## Local LLM via Ollama ###############################################
#LLM_API_KEY="ollama"
#LLM_MODEL="llama3.1:8b"
#LLM_PROVIDER="ollama"
#LLM_ENDPOINT="http://localhost:11434/v1"
#EMBEDDING_PROVIDER="ollama"
#EMBEDDING_MODEL="avr/sfr-embedding-mistral:latest"
#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
#EMBEDDING_DIMENSIONS=4096
#HUGGINGFACE_TOKENIZER="Salesforce/SFR-Embedding-Mistral"

########## OpenRouter (also free) #############################################
#LLM_API_KEY="<<go-get-one-yourself>>"
#LLM_PROVIDER="custom"
#LLM_MODEL="openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
#LLM_ENDPOINT="https://openrouter.ai/api/v1"

########## DeepInfra ##########################################################
#LLM_API_KEY="<<>>"
#LLM_PROVIDER="custom"
#LLM_MODEL="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct"
#LLM_ENDPOINT="https://api.deepinfra.com/v1/openai"
#EMBEDDING_PROVIDER="openai"
#EMBEDDING_API_KEY="<<>>"
#EMBEDDING_MODEL="deepinfra/BAAI/bge-base-en-v1.5"
#EMBEDDING_ENDPOINT=""
#EMBEDDING_API_VERSION=""
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191

########## Release Test #######################################################
#LLM_API_KEY="..."
#OPENAI_API_KEY="..."
#MIGRATION_DB_PATH="~/Downloads/"
#MIGRATION_DB_NAME="Chinook_Sqlite.sqlite"
#MIGRATION_DB_PROVIDER="sqlite"
#GRAPH_DATABASE_URL="bolt://54.246.89.112:7687"
#GRAPH_DATABASE_USERNAME="neo4j"
#GRAPH_DATABASE_PASSWORD="pleaseletmein"

View file

@@ -31,6 +31,7 @@ def get_add_router() -> APIRouter:
             raise ValueError("Either datasetId or datasetName must be provided.")

         try:
+            # TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
             if isinstance(data, str) and data.startswith("http"):
                 if "github" in data:
                     # Perform git clone if the URL is from GitHub
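
The TODO above is not implemented in this PR; a sketch of what such a guard could look like, keyed to the `ALLOW_HTTP_REQUESTS` flag introduced in the template (function name and behavior are assumptions):

```python
import os

def ensure_http_requests_allowed() -> None:
    # Hypothetical guard: refuse outbound fetches and git clones when
    # ALLOW_HTTP_REQUESTS is disabled, limiting the SSRF surface described
    # in the new Security settings.
    if os.getenv("ALLOW_HTTP_REQUESTS", "True").strip().lower() not in ("true", "1"):
        raise PermissionError("Outbound HTTP requests are disabled by ALLOW_HTTP_REQUESTS")
```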

View file

@@ -5,13 +5,16 @@ from sqlalchemy import select
 from sqlalchemy.sql import delete as sql_delete
 from cognee.modules.data.models import Data, DatasetData, Dataset
 from cognee.infrastructure.databases.graph import get_graph_engine
-from io import StringIO, BytesIO
+from io import BytesIO
 import hashlib
+import asyncio
 from uuid import UUID
+from cognee.modules.users.models import User
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.data.methods import get_authorized_existing_datasets
+from cognee.context_global_variables import set_database_global_context_variables
 from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
 from cognee.shared.logging_utils import get_logger

@@ -26,7 +29,9 @@ def get_text_content_hash(text: str) -> str:
 async def delete(
     data: Union[BinaryIO, List[BinaryIO], str, List[str]],
     dataset_name: str = "main_dataset",
+    dataset_id: UUID = None,
     mode: str = "soft",
+    user: User = None,
 ):
     """Delete a document and all its related nodes from both relational and graph databases.

@@ -34,15 +39,27 @@ async def delete(
         data: The data to delete (file, URL, or text)
         dataset_name: Name of the dataset to delete from
         mode: "soft" (default) or "hard" - hard mode also deletes degree-one entity nodes
+        user: User doing the operation, if none default user will be used.
     """
+    if user is None:
+        user = await get_default_user()
+
+    # Verify user has permission to work with given dataset. If dataset_id is given use it, if not use dataset_name
+    dataset = await get_authorized_existing_datasets(
+        [dataset_id] if dataset_id else [dataset_name], "delete", user
+    )
+
+    # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
+    await set_database_global_context_variables(dataset[0].id, dataset[0].owner_id)
+
     # Handle different input types
     if isinstance(data, str):
         if data.startswith("file://"):  # It's a file path
             with open(data.replace("file://", ""), mode="rb") as file:
                 classified_data = classify(file)
                 content_hash = classified_data.get_metadata()["content_hash"]
-                return await delete_single_document(content_hash, dataset_name, mode)
+                return await delete_single_document(content_hash, dataset[0].id, mode)
         elif data.startswith("http"):  # It's a URL
             import requests

@@ -51,26 +68,26 @@
             file_data = BytesIO(response.content)
             classified_data = classify(file_data)
             content_hash = classified_data.get_metadata()["content_hash"]
-            return await delete_single_document(content_hash, dataset_name, mode)
+            return await delete_single_document(content_hash, dataset[0].id, mode)
         else:  # It's a text string
             content_hash = get_text_content_hash(data)
             classified_data = classify(data)
-            return await delete_single_document(content_hash, dataset_name, mode)
+            return await delete_single_document(content_hash, dataset[0].id, mode)
     elif isinstance(data, list):
         # Handle list of inputs sequentially
         results = []
         for item in data:
-            result = await delete(item, dataset_name, mode)
+            result = await delete(item, dataset_name, dataset[0].id, mode)
             results.append(result)
         return {"status": "success", "message": "Multiple documents deleted", "results": results}
     else:  # It's already a BinaryIO
         data.seek(0)  # Ensure we're at the start of the file
         classified_data = classify(data)
         content_hash = classified_data.get_metadata()["content_hash"]
-        return await delete_single_document(content_hash, dataset_name, mode)
+        return await delete_single_document(content_hash, dataset[0].id, mode)


-async def delete_single_document(content_hash: str, dataset_name: str, mode: str = "soft"):
+async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
     """Delete a single document by its content hash."""

     # Delete from graph database

@@ -157,11 +174,11 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
         # Get the dataset
         dataset = (
-            await session.execute(select(Dataset).filter(Dataset.name == dataset_name))
+            await session.execute(select(Dataset).filter(Dataset.id == dataset_id))
         ).scalar_one_or_none()

         if dataset is None:
-            raise DatasetNotFoundError(f"Dataset not found: {dataset_name}")
+            raise DatasetNotFoundError(f"Dataset not found: {dataset_id}")

         # Delete from dataset_data table
         dataset_delete_stmt = sql_delete(DatasetData).where(

@@ -186,7 +203,7 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
             "message": "Document deleted from both graph and relational databases",
             "graph_deletions": deletion_result["deleted_counts"],
             "content_hash": content_hash,
-            "dataset": dataset_name,
+            "dataset": dataset_id,
             "deleted_node_ids": [
                 str(node_id) for node_id in deleted_node_ids
             ],  # Convert back to strings for response
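
A usage sketch of the updated signature, assuming the function is exposed as `cognee.delete` and using a placeholder UUID: when `user` is omitted the default user is resolved, and the authorization check against the dataset runs before anything is deleted.

```python
import asyncio
from uuid import UUID

import cognee

async def main():
    # Delete by raw text: the content hash is computed internally, and
    # get_authorized_existing_datasets() must grant "delete" on the dataset first.
    result = await cognee.delete(
        "some previously ingested text",
        dataset_id=UUID("00000000-0000-0000-0000-000000000000"),  # placeholder id
        mode="soft",
    )
    print(result)

asyncio.run(main())
```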

View file

@@ -1,7 +1,8 @@
 from fastapi import Form, UploadFile, Depends
 from fastapi.responses import JSONResponse
 from fastapi import APIRouter
-from typing import List, Optional
+from typing import List
+from uuid import UUID
 import subprocess
 from cognee.shared.logging_utils import get_logger
 import requests

@@ -18,6 +19,7 @@ def get_delete_router() -> APIRouter:
     async def delete(
         data: List[UploadFile],
         dataset_name: str = Form("main_dataset"),
+        dataset_id: UUID = None,
         mode: str = Form("soft"),
         user: User = Depends(get_authenticated_user),
     ):

@@ -35,6 +37,7 @@ def get_delete_router() -> APIRouter:
             # Handle each file in the list
             results = []
             for file in data:
+                # TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
                 if file.filename.startswith("http"):
                     if "github" in file.filename:
                         # For GitHub repos, we need to get the content hash of each file

@@ -54,12 +57,22 @@ def get_delete_router() -> APIRouter:
                         response.raise_for_status()
                         file_data = response.content
-                        result = await cognee_delete(
-                            file_data, dataset_name=dataset_name, mode=mode
-                        )
+                        result = await cognee_delete(
+                            file_data,
+                            dataset_name=dataset_name,
+                            dataset_id=dataset_id,
+                            mode=mode,
+                            user=user,
+                        )
                         results.append(result)
                 else:
                     # Handle uploaded file by accessing its file attribute
-                    result = await cognee_delete(file.file, dataset_name=dataset_name, mode=mode)
+                    result = await cognee_delete(
+                        file.file,
+                        dataset_name=dataset_name,
+                        dataset_id=dataset_id,
+                        mode=mode,
+                        user=user,
+                    )
                     results.append(result)

             if len(results) == 1:
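
One subtlety worth noting: `dataset_id: UUID = None` is not wrapped in `Form(...)`, so FastAPI treats it as a query parameter rather than a form field. A client call might therefore look like this; the route path and host are assumptions for illustration:

```python
import requests

response = requests.post(
    "http://localhost:8000/api/v1/delete",  # assumed mount path
    files=[("data", open("document.pdf", "rb"))],  # List[UploadFile] field
    data={"dataset_name": "main_dataset", "mode": "soft"},  # form fields
    params={"dataset_id": "00000000-0000-0000-0000-000000000000"},  # query param
)
response.raise_for_status()
print(response.json())
```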

View file

@@ -176,43 +176,6 @@ class DataPoint(BaseModel):
         """
         return self.model_validate_json(json_str)

-    # Pickle Serialization
-    def to_pickle(self) -> bytes:
-        """
-        Serialize the DataPoint instance to a byte format for pickling.
-
-        This method uses the built-in Python pickle module to convert the instance into a byte
-        stream for persistence or transmission.
-
-        Returns:
-        --------
-            - bytes: The pickled byte representation of the DataPoint instance.
-        """
-        return pickle.dumps(self.dict())
-
-    @classmethod
-    def from_pickle(self, pickled_data: bytes):
-        """
-        Deserialize a DataPoint instance from a pickled byte stream.
-
-        The method converts the byte stream back into a DataPoint instance by loading the data
-        and validating it through the model's constructor.
-
-        Parameters:
-        -----------
-            - pickled_data (bytes): The bytes representation of a pickled DataPoint instance to
-              be deserialized.
-
-        Returns:
-        --------
-            A new DataPoint instance created from the pickled data.
-        """
-        data = pickle.loads(pickled_data)
-        return self(**data)
-
     def to_dict(self, **kwargs) -> Dict[str, Any]:
         """
         Convert the DataPoint instance to a dictionary representation.
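
This removal is the security-relevant part: `pickle.loads` on externally supplied bytes can execute arbitrary code via crafted `__reduce__` payloads, whereas the JSON path the class keeps (`model_validate_json`, visible in the context above) only constructs validated fields. A standalone pydantic illustration using a generic model, not Cognee's `DataPoint`:

```python
from pydantic import BaseModel

class Point(BaseModel):
    label: str

# JSON round-trips are safe: pydantic validates plain data and never executes
# callables embedded in the payload, unlike pickle deserialization.
point = Point(label="a")
restored = Point.model_validate_json(point.model_dump_json())
assert restored == point
```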

View file

@@ -20,6 +20,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], datase
         file_path = data_item
     # data is a file path
     elif data_item.startswith("file://") or data_item.startswith("/"):
+        # TODO: Add check if ACCEPT_LOCAL_FILE_PATH is enabled, if it's not raise an error
         file_path = data_item.replace("file://", "")
     # data is text
     else:
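
As with the HTTP check, this TODO is left for follow-up work; a sketch of the guard it describes, keyed to the new `ACCEPT_LOCAL_FILE_PATH` flag (function name and behavior are assumptions):

```python
import os

def ensure_local_file_paths_allowed() -> None:
    # Hypothetical guard: when Cognee runs as a backend, ACCEPT_LOCAL_FILE_PATH=False
    # should reject file:// and absolute-path inputs so API clients cannot make the
    # server ingest arbitrary files from its own filesystem.
    if os.getenv("ACCEPT_LOCAL_FILE_PATH", "True").strip().lower() not in ("true", "1"):
        raise PermissionError("Local file paths are disabled by ACCEPT_LOCAL_FILE_PATH")
```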