diff --git a/.env.template b/.env.template
index fc2b2450a..f789fde4b 100644
--- a/.env.template
+++ b/.env.template
@@ -1,189 +1,112 @@
+###############################################################################
+# NOTE: With default settings Cognee only needs an OpenAI LLM_API_KEY to be set.
+# The rest of the settings don't have to be set.
+# Default relational database: SQLite
+# Default vector database    : LanceDB
+# Default graph database     : Kuzu
+#
+# These default databases are all file-based, so no extra setup is needed
+# for local use.
+###############################################################################
-###
-### DEV
-###
-
-
-TOKENIZERS_PARALLELISM="false"
-
-###
-### LLM
-###
-
-###
-### simple, "expensive", an OpenAPI key
-###
+################################################################################
+# 🧠 LLM Settings
+################################################################################
 LLM_API_KEY="your_api_key"
-
-###
-### DEV LLM, cheap with content filters
-###
-
-LLM_MODEL="azure/gpt-4o-mini"
-LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
-LLM_API_KEY="< APIRouter:
             raise ValueError("Either datasetId or datasetName must be provided.")
 
         try:
+            # TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
             if isinstance(data, str) and data.startswith("http"):
                 if "github" in data:
                     # Perform git clone if the URL is from GitHub
diff --git a/cognee/api/v1/delete/delete.py b/cognee/api/v1/delete/delete.py
index 1b5a64fc0..363acff3e 100644
--- a/cognee/api/v1/delete/delete.py
+++ b/cognee/api/v1/delete/delete.py
@@ -5,13 +5,16 @@
 from sqlalchemy import select
 from sqlalchemy.sql import delete as sql_delete
 from cognee.modules.data.models import Data, DatasetData, Dataset
 from cognee.infrastructure.databases.graph import get_graph_engine
-from io import StringIO, BytesIO
+from io import BytesIO
 import hashlib
-import asyncio
 from uuid import UUID
+from cognee.modules.users.models import User
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.data.methods import get_authorized_existing_datasets
+from cognee.context_global_variables import set_database_global_context_variables
 from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
 from cognee.shared.logging_utils import get_logger
@@ -26,7 +29,9 @@ def get_text_content_hash(text: str) -> str:
 async def delete(
     data: Union[BinaryIO, List[BinaryIO], str, List[str]],
     dataset_name: str = "main_dataset",
+    dataset_id: UUID = None,
     mode: str = "soft",
+    user: User = None,
 ):
     """Delete a document and all its related nodes from both relational and graph databases.
 
@@ -34,15 +39,27 @@ async def delete(
         data: The data to delete (file, URL, or text)
         dataset_name: Name of the dataset to delete from
         mode: "soft" (default) or "hard" - hard mode also deletes degree-one entity nodes
+        user: User doing the operation, if none default user will be used.
     """
+    if user is None:
+        user = await get_default_user()
+
+    # Verify user has permission to work with given dataset. If dataset_id is given use it, if not use dataset_name
+    dataset = await get_authorized_existing_datasets(
+        [dataset_id] if dataset_id else [dataset_name], "delete", user
+    )
+
+    # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
+    await set_database_global_context_variables(dataset[0].id, dataset[0].owner_id)
+
     # Handle different input types
     if isinstance(data, str):
         if data.startswith("file://"):
             # It's a file path
             with open(data.replace("file://", ""), mode="rb") as file:
                 classified_data = classify(file)
                 content_hash = classified_data.get_metadata()["content_hash"]
-                return await delete_single_document(content_hash, dataset_name, mode)
+                return await delete_single_document(content_hash, dataset[0].id, mode)
         elif data.startswith("http"):
             # It's a URL
             import requests
@@ -51,26 +68,26 @@ async def delete(
             file_data = BytesIO(response.content)
             classified_data = classify(file_data)
             content_hash = classified_data.get_metadata()["content_hash"]
-            return await delete_single_document(content_hash, dataset_name, mode)
+            return await delete_single_document(content_hash, dataset[0].id, mode)
         else:
             # It's a text string
             content_hash = get_text_content_hash(data)
             classified_data = classify(data)
-            return await delete_single_document(content_hash, dataset_name, mode)
+            return await delete_single_document(content_hash, dataset[0].id, mode)
     elif isinstance(data, list):
         # Handle list of inputs sequentially
         results = []
         for item in data:
-            result = await delete(item, dataset_name, mode)
+            result = await delete(item, dataset_name, dataset[0].id, mode)
             results.append(result)
         return {"status": "success", "message": "Multiple documents deleted", "results": results}
     else:
         # It's already a BinaryIO
         data.seek(0)  # Ensure we're at the start of the file
         classified_data = classify(data)
         content_hash = classified_data.get_metadata()["content_hash"]
-        return await delete_single_document(content_hash, dataset_name, mode)
+        return await delete_single_document(content_hash, dataset[0].id, mode)
 
-async def delete_single_document(content_hash: str, dataset_name: str, mode: str = "soft"):
+async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
     """Delete a single document by its content hash."""
     # Delete from graph database
@@ -157,11 +174,11 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
 
         # Get the dataset
         dataset = (
-            await session.execute(select(Dataset).filter(Dataset.name == dataset_name))
+            await session.execute(select(Dataset).filter(Dataset.id == dataset_id))
         ).scalar_one_or_none()
 
         if dataset is None:
-            raise DatasetNotFoundError(f"Dataset not found: {dataset_name}")
+            raise DatasetNotFoundError(f"Dataset not found: {dataset_id}")
 
         # Delete from dataset_data table
         dataset_delete_stmt = sql_delete(DatasetData).where(
@@ -186,7 +203,7 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
         "message": "Document deleted from both graph and relational databases",
         "graph_deletions": deletion_result["deleted_counts"],
         "content_hash": content_hash,
-        "dataset": dataset_name,
+        "dataset": dataset_id,
         "deleted_node_ids": [
             str(node_id) for node_id in deleted_node_ids
         ],  # Convert back to strings for response
diff --git a/cognee/api/v1/delete/routers/get_delete_router.py b/cognee/api/v1/delete/routers/get_delete_router.py
index f9af61dff..3684caefa 100644
--- a/cognee/api/v1/delete/routers/get_delete_router.py
+++ b/cognee/api/v1/delete/routers/get_delete_router.py
@@ -1,7 +1,8 @@
 from fastapi import Form, UploadFile, Depends
 from fastapi.responses import JSONResponse
 from fastapi import APIRouter
-from typing import List, Optional
+from typing import List
+from uuid import UUID
 import subprocess
 from cognee.shared.logging_utils import get_logger
 import requests
@@ -18,6 +19,7 @@ def get_delete_router() -> APIRouter:
     async def delete(
         data: List[UploadFile],
         dataset_name: str = Form("main_dataset"),
+        dataset_id: UUID = None,
         mode: str = Form("soft"),
         user: User = Depends(get_authenticated_user),
     ):
@@ -35,6 +37,7 @@ def get_delete_router() -> APIRouter:
         # Handle each file in the list
         results = []
         for file in data:
+            # TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
            if file.filename.startswith("http"):
                 if "github" in file.filename:
                     # For GitHub repos, we need to get the content hash of each file
@@ -54,12 +57,22 @@ def get_delete_router() -> APIRouter:
                     response.raise_for_status()
                     file_data = response.content
                     result = await cognee_delete(
-                        file_data, dataset_name=dataset_name, mode=mode
+                        file_data,
+                        dataset_name=dataset_name,
+                        dataset_id=dataset_id,
+                        mode=mode,
+                        user=user,
                     )
                     results.append(result)
            else:
                 # Handle uploaded file by accessing its file attribute
-                result = await cognee_delete(file.file, dataset_name=dataset_name, mode=mode)
+                result = await cognee_delete(
+                    file.file,
+                    dataset_name=dataset_name,
+                    dataset_id=dataset_id,
+                    mode=mode,
+                    user=user,
+                )
                 results.append(result)
 
         if len(results) == 1:
diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py
index 2ab3e1752..3b5f7424a 100644
--- a/cognee/infrastructure/engine/models/DataPoint.py
+++ b/cognee/infrastructure/engine/models/DataPoint.py
@@ -176,43 +176,6 @@ class DataPoint(BaseModel):
         """
         return self.model_validate_json(json_str)
 
-    # Pickle Serialization
-    def to_pickle(self) -> bytes:
-        """
-        Serialize the DataPoint instance to a byte format for pickling.
-
-        This method uses the built-in Python pickle module to convert the instance into a byte
-        stream for persistence or transmission.
-
-        Returns:
-        --------
-
-            - bytes: The pickled byte representation of the DataPoint instance.
-        """
-        return pickle.dumps(self.dict())
-
-    @classmethod
-    def from_pickle(self, pickled_data: bytes):
-        """
-        Deserialize a DataPoint instance from a pickled byte stream.
-
-        The method converts the byte stream back into a DataPoint instance by loading the data
-        and validating it through the model's constructor.
-
-        Parameters:
-        -----------
-
-            - pickled_data (bytes): The bytes representation of a pickled DataPoint instance to
-              be deserialized.
-
-        Returns:
-        --------
-
-            A new DataPoint instance created from the pickled data.
-        """
-        data = pickle.loads(pickled_data)
-        return self(**data)
-
     def to_dict(self, **kwargs) -> Dict[str, Any]:
         """
         Convert the DataPoint instance to a dictionary representation.
diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index 976c1dd26..fefd8728b 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -20,6 +20,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], datase
         file_path = data_item
     # data is a file path
     elif data_item.startswith("file://") or data_item.startswith("/"):
+        # TODO: Add check if ACCEPT_LOCAL_FILE_PATH is enabled, if it's not raise an error
         file_path = data_item.replace("file://", "")
     # data is text
     else:
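
For reference, a minimal usage sketch of the Python API surface this patch touches. It assumes cognee re-exports add() and delete() at package level (as cognee.api.v1.delete does in this diff); the file path is hypothetical, "main_dataset" is simply the default dataset name from the new signature, and dataset_id / user are left at their defaults so the permission check falls back to the default user, as the patched delete() does when user is None. This is an illustrative sketch, not part of the change itself.

import asyncio

import cognee  # assumes the package-level add()/delete() entry points


async def main():
    # Hypothetical local file, ingested into the default dataset.
    await cognee.add("file:///tmp/example.txt", dataset_name="main_dataset")

    # With this patch, delete() also accepts dataset_id and user; passing only
    # dataset_name keeps the old call shape, and authorization is resolved via
    # get_authorized_existing_datasets() before anything is removed.
    result = await cognee.delete(
        "file:///tmp/example.txt",
        dataset_name="main_dataset",
        mode="soft",  # "hard" would also delete degree-one entity nodes
    )
    print(result)


asyncio.run(main())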