Secure API v2 (#1050)

<!-- .github/pull_request_template.md -->

## Description
Modify endpoints to allow for stricter security across different infrastructure
needs and setups: the delete flow now verifies dataset-level permissions (by
`dataset_id` or `dataset_name`), the environment template is reorganized around the
security-related flags (`ACCEPT_LOCAL_FILE_PATH`, `ALLOW_HTTP_REQUESTS`,
`ENABLE_BACKEND_ACCESS_CONTROL`), and the pickle serialization helpers are removed
from `DataPoint`.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Igor Ilic 2025-07-07 20:41:43 +02:00 committed by GitHub
parent 3c3c89a140
commit cb45897d7d
6 changed files with 211 additions and 202 deletions

View file

@ -1,189 +1,112 @@
###############################################################################
# NOTE: With default settings Cognee only needs an OpenAI LLM_API_KEY to be set.
# The rest of the settings don't have to be set.
# Default relational database: SQLite
# Default vector database : LanceDB
# Default graph database : Kuzu
#
# These default databases are all file-based, so no extra setup is needed
# for local use.
###############################################################################
###
### DEV
###
TOKENIZERS_PARALLELISM="false"
###
### LLM
###
###
### simple, "expensive", an OpenAI key
###
################################################################################
# 🧠 LLM Settings
################################################################################
LLM_API_KEY="your_api_key"
###
### DEV LLM, cheap with content filters
###
LLM_MODEL="azure/gpt-4o-mini"
LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
LLM_API_KEY="<<TALK TO YOUR AZURE GUY"
LLM_API_VERSION="2024-12-01-preview"
#llm api version might not be relevant
LLM_MODEL="openai/gpt-4o-mini"
LLM_PROVIDER="openai"
LLM_ENDPOINT=""
LLM_API_VERSION=""
LLM_MAX_TOKENS="16384"
EMBEDDING_MODEL="azure/text-embedding-3-large"
EMBEDDING_ENDPOINT="https://DNS.openai.azure.com/openai/deployments/text-embedding-3-large"
EMBEDDING_API_KEY="<<TALK TO YOUR AZURE GUY>"
EMBEDDING_API_VERSION="2024-12-01-preview"
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191
###
### free local LLM, install it
###
LLM_API_KEY = "ollama"
LLM_MODEL = "llama3.1:8b"
LLM_PROVIDER = "ollama"
LLM_ENDPOINT = "http://localhost:11434/v1"
EMBEDDING_PROVIDER = "ollama"
EMBEDDING_MODEL = "avr/sfr-embedding-mistral:latest"
EMBEDDING_ENDPOINT = "http://localhost:11434/api/embeddings"
EMBEDDING_DIMENSIONS = 4096
HUGGINGFACE_TOKENIZER = "Salesforce/SFR-Embedding-Mistral"
###
### openrouter, also free
###
LLM_API_KEY="<<go-get-one-yourself"
LLM_PROVIDER="custom"
LLM_MODEL="openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
LLM_ENDPOINT="https://openrouter.ai/api/v1"
###
### deepinfra
###
LLM_API_KEY="<<>>"
LLM_PROVIDER="custom"
LLM_MODEL="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct"
LLM_ENDPOINT="https://api.deepinfra.com/v1/openai"
EMBEDDING_PROVIDER="openai"
EMBEDDING_API_KEY="<<>>"
EMBEDDING_MODEL="deepinfra/BAAI/bge-base-en-v1.5"
EMBEDDING_MODEL="openai/text-embedding-3-large"
EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191
# If embedding key is not provided same key set for LLM_API_KEY will be used
#EMBEDDING_API_KEY="your_api_key"
###
### DB
###
###
### db minimal/default
###
GRAPH_DATABASE_PROVIDER="networkx"
VECTOR_DB_PROVIDER="lancedb"
DB_PROVIDER=sqlite
DB_NAME=cognee_db
###
### Relational options
###
################################################################################
# 🗄️ Relational database settings
################################################################################
DB_PROVIDER="sqlite"
DB_NAME=cognee_db
DB_PROVIDER=postgres
DB_NAME=cognee_db
DB_HOST=127.0.0.1
DB_PORT=5432
DB_USERNAME=cognee
DB_PASSWORD=cognee
# -- To switch to Postgres / PGVector, uncomment and fill these: -------------
#DB_PROVIDER=postgres
#DB_NAME=cognee_db
# To use Postgres with the Cognee backend in Docker compose use the following instead: DB_HOST=host.docker.internal
#DB_HOST=127.0.0.1
#DB_PORT=5432
#DB_USERNAME=cognee
#DB_PASSWORD=cognee
###
### Graph options
###
#Default
################################################################################
# 🕸️ Graph Database settings
################################################################################
# Default (local file-based)
GRAPH_DATABASE_PROVIDER="kuzu"
#or if using remote
# -- To switch to Remote Kuzu uncomment and fill these: -------------------------------------------------------------
#GRAPH_DATABASE_PROVIDER="kuzu"
#GRAPH_DATABASE_PROVIDER="kuzu-remote"
#GRAPH_DATABASE_URL="http://localhost:8000"
#GRAPH_DATABASE_USERNAME=XXX
#GRAPH_DATABASE_PASSWORD=YYY
GRAPH_DATABASE_PROVIDER="kuzu"
GRAPH_DATABASE_PROVIDER="kuzu-remote"
GRAPH_DATABASE_URL="http://localhost:8000"
GRAPH_DATABASE_USERNAME=XXX
GRAPH_DATABASE_PASSWORD=YYY
# -- To switch to Neo4j uncomment and fill these: -------------------------------------------------------------------
#GRAPH_DATABASE_PROVIDER="neo4j"
#GRAPH_DATABASE_URL=bolt://localhost:7687
#GRAPH_DATABASE_USERNAME=neo4j
#GRAPH_DATABASE_PASSWORD=localneo4j
# or if using neo4j
GRAPH_DATABASE_PROVIDER="neo4j"
GRAPH_DATABASE_URL=bolt://localhost:7687
GRAPH_DATABASE_USERNAME=neo4j
GRAPH_DATABASE_PASSWORD=localneo4j
###
### Vector options
###
################################################################################
# 📐 Vector Database settings
################################################################################
# Supported providers: pgvector | qdrant | weaviate | milvus | lancedb | chromadb
VECTOR_DB_PROVIDER="lancedb"
# Only needed when a hosted/cloud vector database is used
VECTOR_DB_URL=
VECTOR_DB_KEY=
VECTOR_DB_PROVIDER="pgvector"
###
### for release test
###
LLM_API_KEY="..."
OPENAI_API_KEY="..."
MIGRATION_DB_PATH="~/Downloads/"
MIGRATION_DB_NAME="Chinook_Sqlite.sqlite"
MIGRATION_DB_PROVIDER="sqlite"
GRAPH_DATABASE_URL="bolt://54.246.89.112:7687"
GRAPH_DATABASE_USERNAME="neo4j"
GRAPH_DATABASE_PASSWORD="pleaseletmein"
###
### ROOT DIRECTORY IF USING COGNEE LIB INSIDE A DOCKER
###
################################################################################
# 📂 ROOT DIRECTORIES WHEN USING THE COGNEE LIB INSIDE A DOCKER CONTAINER
################################################################################
# Set up the Cognee system directory. Cognee will store system files and databases here.
DATA_ROOT_DIRECTORY='/cognee_data/data'
SYSTEM_ROOT_DIRECTORY='/cognee_data/system'
################################################################################
# 🔄 MIGRATION (RELATIONAL → GRAPH) SETTINGS
################################################################################
# Postgres-specific parameters (only needed if Postgres or PGVector is used). Not needed for Cognee's default SQLite-NetworkX-LanceDB setup.
# DB_USERNAME=cognee
# DB_PASSWORD=cognee
# To use Postgres with the Cognee backend in Docker compose use the following instead: DB_HOST=host.docker.internal
# DB_HOST=127.0.0.1
# DB_PORT=5432
MIGRATION_DB_PATH="/path/to/migration/directory"
MIGRATION_DB_NAME="migration_database.sqlite"
MIGRATION_DB_PROVIDER="sqlite"
# Params for migrating relational database data to graph / Cognee ( PostgreSQL and SQLite supported )
# MIGRATION_DB_PATH="/path/to/migration/directory"
# MIGRATION_DB_NAME="migration_database.sqlite"
# MIGRATION_DB_PROVIDER="sqlite"
# Postgres specific parameters for migration
# -- Postgres-specific migration params --------------------------------------
# MIGRATION_DB_USERNAME=cognee
# MIGRATION_DB_PASSWORD=cognee
# MIGRATION_DB_HOST="127.0.0.1"
# MIGRATION_DB_PORT=5432
# LITELLM Logging Level. Set to quiet down logging
LITELLM_LOG="ERROR"
################################################################################
# 🔒 Security Settings
################################################################################
# Set this environment variable to disable sending telemetry data
# TELEMETRY_DISABLED=1
# When set to False, Cognee will not allow adding local system file paths. Set this to False when Cognee is used as a backend.
ACCEPT_LOCAL_FILE_PATH=True
# When set to False, Cognee will not send outbound HTTP requests.
# This protects against Server-Side Request Forgery (SSRF) when proper infrastructure safeguards are not in place.
ALLOW_HTTP_REQUESTS=True
# Set this variable to True to enforce usage of backend access control for Cognee
# Note: This is only currently supported by the following databases:
@ -194,3 +117,94 @@ LITELLM_LOG="ERROR"
# It enforces the use of LanceDB and KuzuDB and creates separate databases per Cognee user + dataset
ENABLE_BACKEND_ACCESS_CONTROL=False
################################################################################
# 🛠️ DEV Settings
################################################################################
ENV="local"
TOKENIZERS_PARALLELISM="false"
# LITELLM Logging Level. Set to quiet down logging
LITELLM_LOG="ERROR"
# Set this environment variable to disable sending telemetry data
# TELEMETRY_DISABLED=1
# Default User Configuration
# DEFAULT_USER_EMAIL=""
# DEFAULT_USER_PASSWORD=""
------------------------------- END OF POSSIBLE SETTINGS -------------------------------
###############################################################################
# 🧪 EXAMPLE OVERRIDES (commented out)
###############################################################################
# The blocks below show how to configure alternative providers.
# Uncomment + fill values to switch.
########## Azure OpenAI #######################################################
#LLM_MODEL="azure/gpt-4o-mini"
#LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
#LLM_API_KEY="<<TALK TO YOUR AZURE GUY"
#LLM_API_VERSION="2024-12-01-preview"
## llm api version might not be relevant
#LLM_MAX_TOKENS="16384"
#EMBEDDING_MODEL="azure/text-embedding-3-large"
#EMBEDDING_ENDPOINT="https://DNS.openai.azure.com/openai/deployments/text-embedding-3-large"
#EMBEDDING_API_KEY="<<TALK TO YOUR AZURE GUY>"
#EMBEDDING_API_VERSION="2024-12-01-preview"
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191
########## Local LLM via Ollama ###############################################
#LLM_API_KEY ="ollama"
#LLM_MODEL="llama3.1:8b"
#LLM_PROVIDER="ollama"
#LLM_ENDPOINT="http://localhost:11434/v1"
#EMBEDDING_PROVIDER="ollama"
#EMBEDDING_MODEL="avr/sfr-embedding-mistral:latest"
#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
#EMBEDDING_DIMENSIONS=4096
#HUGGINGFACE_TOKENIZER="Salesforce/SFR-Embedding-Mistral"
########## OpenRouter (also free) #########################################################
#LLM_API_KEY="<<go-get-one-yourself"
#LLM_PROVIDER="custom"
#LLM_MODEL="openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
#LLM_ENDPOINT="https://openrouter.ai/api/v1"
########## DeepInfra ##########################################################
#LLM_API_KEY="<<>>"
#LLM_PROVIDER="custom"
#LLM_MODEL="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct"
#LLM_ENDPOINT="https://api.deepinfra.com/v1/openai"
#EMBEDDING_PROVIDER="openai"
#EMBEDDING_API_KEY="<<>>"
#EMBEDDING_MODEL="deepinfra/BAAI/bge-base-en-v1.5"
#EMBEDDING_ENDPOINT=""
#EMBEDDING_API_VERSION=""
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191
########## Release Test ###############################################
#LLM_API_KEY="..."
#OPENAI_API_KEY="..."
#MIGRATION_DB_PATH="~/Downloads/"
#MIGRATION_DB_NAME="Chinook_Sqlite.sqlite"
#MIGRATION_DB_PROVIDER="sqlite"
#GRAPH_DATABASE_URL="bolt://54.246.89.112:7687"
#GRAPH_DATABASE_USERNAME="neo4j"
#GRAPH_DATABASE_PASSWORD="pleaseletmein"
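
The Security Settings block above documents three deployment-hardening flags. As a rough, hypothetical illustration (not part of this diff, and not Cognee's actual API), a deployment script could sanity-check them before running Cognee as a backend service:

```python
import os


def _flag(name: str, default: str) -> bool:
    """Parse a boolean environment variable the way typical .env loaders do."""
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")


def backend_hardening_warnings() -> list:
    """Return warnings if the deployment keeps the permissive defaults."""
    warnings = []
    if _flag("ACCEPT_LOCAL_FILE_PATH", "True"):
        warnings.append("ACCEPT_LOCAL_FILE_PATH should be False when Cognee is used as a backend.")
    if _flag("ALLOW_HTTP_REQUESTS", "True"):
        warnings.append("ALLOW_HTTP_REQUESTS=True relies on your infrastructure for SSRF protection.")
    if not _flag("ENABLE_BACKEND_ACCESS_CONTROL", "False"):
        warnings.append("ENABLE_BACKEND_ACCESS_CONTROL=False disables per-user/per-dataset isolation.")
    return warnings


if __name__ == "__main__":
    for warning in backend_hardening_warnings():
        print(warning)
```

The suggested values simply mirror the comments in the template: disable local file paths and outbound HTTP, and enable backend access control, when Cognee is exposed as a backend.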

View file

@ -31,6 +31,7 @@ def get_add_router() -> APIRouter:
raise ValueError("Either datasetId or datasetName must be provided.")
try:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
if isinstance(data, str) and data.startswith("http"):
if "github" in data:
# Perform git clone if the URL is from GitHub
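
The TODO above points at gating remote ingestion on the new ALLOW_HTTP_REQUESTS flag. A hypothetical shape for that guard (the helper names and the raw environment lookup are assumptions, not Cognee's configuration API):

```python
import os


def _http_requests_allowed() -> bool:
    # Assumption: the flag is read straight from the environment; Cognee may expose
    # a dedicated settings accessor instead.
    return os.getenv("ALLOW_HTTP_REQUESTS", "true").strip().lower() in ("1", "true", "yes")


def guard_remote_source(data: str) -> None:
    """Refuse URLs (and therefore git clone) when outbound HTTP is disabled."""
    if data.startswith("http") and not _http_requests_allowed():
        raise PermissionError("Remote ingestion is disabled via ALLOW_HTTP_REQUESTS.")
```

Called just before the `data.startswith("http")` branch above, this would make the add endpoint honor the same flag the .env template documents.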

View file

@ -5,13 +5,16 @@ from sqlalchemy import select
from sqlalchemy.sql import delete as sql_delete
from cognee.modules.data.models import Data, DatasetData, Dataset
from cognee.infrastructure.databases.graph import get_graph_engine
from io import StringIO, BytesIO
from io import BytesIO
import hashlib
import asyncio
from uuid import UUID
from cognee.modules.users.models import User
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.engine import DataPoint
from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
from cognee.modules.users.methods import get_default_user
from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.context_global_variables import set_database_global_context_variables
from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
from cognee.shared.logging_utils import get_logger
@ -26,7 +29,9 @@ def get_text_content_hash(text: str) -> str:
async def delete(
data: Union[BinaryIO, List[BinaryIO], str, List[str]],
dataset_name: str = "main_dataset",
dataset_id: UUID = None,
mode: str = "soft",
user: User = None,
):
"""Delete a document and all its related nodes from both relational and graph databases.
@ -34,15 +39,27 @@ async def delete(
data: The data to delete (file, URL, or text)
dataset_name: Name of the dataset to delete from
mode: "soft" (default) or "hard" - hard mode also deletes degree-one entity nodes
user: User performing the operation; if None, the default user will be used.
"""
if user is None:
user = await get_default_user()
# Verify user has permission to work with given dataset. If dataset_id is given use it, if not use dataset_name
dataset = await get_authorized_existing_datasets(
[dataset_id] if dataset_id else [dataset_name], "delete", user
)
# Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
await set_database_global_context_variables(dataset[0].id, dataset[0].owner_id)
# Handle different input types
if isinstance(data, str):
if data.startswith("file://"): # It's a file path
with open(data.replace("file://", ""), mode="rb") as file:
classified_data = classify(file)
content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
elif data.startswith("http"): # It's a URL
import requests
@ -51,26 +68,26 @@ async def delete(
file_data = BytesIO(response.content)
classified_data = classify(file_data)
content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
else: # It's a text string
content_hash = get_text_content_hash(data)
classified_data = classify(data)
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
elif isinstance(data, list):
# Handle list of inputs sequentially
results = []
for item in data:
result = await delete(item, dataset_name, mode)
result = await delete(item, dataset_name, dataset[0].id, mode)
results.append(result)
return {"status": "success", "message": "Multiple documents deleted", "results": results}
else: # It's already a BinaryIO
data.seek(0) # Ensure we're at the start of the file
classified_data = classify(data)
content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode)
return await delete_single_document(content_hash, dataset[0].id, mode)
async def delete_single_document(content_hash: str, dataset_name: str, mode: str = "soft"):
async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
"""Delete a single document by its content hash."""
# Delete from graph database
@ -157,11 +174,11 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
# Get the dataset
dataset = (
await session.execute(select(Dataset).filter(Dataset.name == dataset_name))
await session.execute(select(Dataset).filter(Dataset.id == dataset_id))
).scalar_one_or_none()
if dataset is None:
raise DatasetNotFoundError(f"Dataset not found: {dataset_name}")
raise DatasetNotFoundError(f"Dataset not found: {dataset_id}")
# Delete from dataset_data table
dataset_delete_stmt = sql_delete(DatasetData).where(
@ -186,7 +203,7 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
"message": "Document deleted from both graph and relational databases",
"graph_deletions": deletion_result["deleted_counts"],
"content_hash": content_hash,
"dataset": dataset_name,
"dataset": dataset_id,
"deleted_node_ids": [
str(node_id) for node_id in deleted_node_ids
], # Convert back to strings for response
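
To illustrate the updated signature, a minimal usage sketch; the import path, file path, and UUID are placeholders for the example and not taken from this PR:

```python
import asyncio
from uuid import UUID

from cognee import delete  # assumption: adjust if the function lives under a different module path


async def main():
    # dataset_id now takes precedence over dataset_name for the permission check,
    # and the default user is resolved when no user is passed.
    result = await delete(
        "file:///data/example_report.pdf",  # example path
        dataset_id=UUID("00000000-0000-0000-0000-000000000000"),  # example UUID
        mode="soft",
    )
    print(result)


asyncio.run(main())
```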

View file

@ -1,7 +1,8 @@
from fastapi import Form, UploadFile, Depends
from fastapi.responses import JSONResponse
from fastapi import APIRouter
from typing import List, Optional
from typing import List
from uuid import UUID
import subprocess
from cognee.shared.logging_utils import get_logger
import requests
@ -18,6 +19,7 @@ def get_delete_router() -> APIRouter:
async def delete(
data: List[UploadFile],
dataset_name: str = Form("main_dataset"),
dataset_id: UUID = None,
mode: str = Form("soft"),
user: User = Depends(get_authenticated_user),
):
@ -35,6 +37,7 @@ def get_delete_router() -> APIRouter:
# Handle each file in the list
results = []
for file in data:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
if file.filename.startswith("http"):
if "github" in file.filename:
# For GitHub repos, we need to get the content hash of each file
@ -54,12 +57,22 @@ def get_delete_router() -> APIRouter:
response.raise_for_status()
file_data = response.content
result = await cognee_delete(
file_data, dataset_name=dataset_name, mode=mode
file_data,
dataset_name=dataset_name,
dataset_id=dataset_id,
mode=mode,
user=user,
)
results.append(result)
else:
# Handle uploaded file by accessing its file attribute
result = await cognee_delete(file.file, dataset_name=dataset_name, mode=mode)
result = await cognee_delete(
file.file,
dataset_name=dataset_name,
dataset_id=dataset_id,
mode=mode,
user=user,
)
results.append(result)
if len(results) == 1:
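
For completeness, a hypothetical client-side call against the updated router. The mount path, HTTP method, and auth header are assumptions (the route decorator is outside this excerpt); only the parameter names mirror the signature above. Since `dataset_id` is not declared with `Form(...)`, FastAPI reads it from the query string:

```python
import requests

# Hypothetical endpoint URL and auth scheme; only the field names come from the diff above.
with open("example_report.pdf", "rb") as f:
    response = requests.delete(
        "http://localhost:8000/api/v1/delete",  # assumed mount path
        params={"dataset_id": "00000000-0000-0000-0000-000000000000"},  # query parameter (no Form())
        data={"dataset_name": "main_dataset", "mode": "soft"},  # Form fields
        files=[("data", ("example_report.pdf", f, "application/pdf"))],  # List[UploadFile]
        headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
    )
response.raise_for_status()
print(response.json())
```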

View file

@ -176,43 +176,6 @@ class DataPoint(BaseModel):
"""
return self.model_validate_json(json_str)
# Pickle Serialization
def to_pickle(self) -> bytes:
"""
Serialize the DataPoint instance to a byte format for pickling.
This method uses the built-in Python pickle module to convert the instance into a byte
stream for persistence or transmission.
Returns:
--------
- bytes: The pickled byte representation of the DataPoint instance.
"""
return pickle.dumps(self.dict())
@classmethod
def from_pickle(self, pickled_data: bytes):
"""
Deserialize a DataPoint instance from a pickled byte stream.
The method converts the byte stream back into a DataPoint instance by loading the data
and validating it through the model's constructor.
Parameters:
-----------
- pickled_data (bytes): The bytes representation of a pickled DataPoint instance to
be deserialized.
Returns:
--------
A new DataPoint instance created from the pickled data.
"""
data = pickle.loads(pickled_data)
return self(**data)
def to_dict(self, **kwargs) -> Dict[str, Any]:
"""
Convert the DataPoint instance to a dictionary representation.

View file

@ -20,6 +20,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], datase
file_path = data_item
# data is a file path
elif data_item.startswith("file://") or data_item.startswith("/"):
# TODO: Add check if ACCEPT_LOCAL_FILE_PATH is enabled, if it's not raise an error
file_path = data_item.replace("file://", "")
# data is text
else:
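
The TODO above could be resolved with a guard analogous to the HTTP one; again a hypothetical sketch (the direct environment lookup is an assumption, not Cognee's settings API):

```python
import os


def _local_file_paths_accepted() -> bool:
    # Assumption: read the flag straight from the environment.
    return os.getenv("ACCEPT_LOCAL_FILE_PATH", "true").strip().lower() in ("1", "true", "yes")


def guard_local_path(data_item: str) -> None:
    """Reject raw file system paths when Cognee runs as a backend and must not read host files."""
    if (data_item.startswith("file://") or data_item.startswith("/")) and not _local_file_paths_accepted():
        raise ValueError("Local file paths are disabled via ACCEPT_LOCAL_FILE_PATH.")
```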