cleanup of delete dataset methods

This commit is contained in:
rajeevrajeshuni 2025-11-30 07:26:24 +05:30
parent 3a05cca74b
commit 40846df92d
8 changed files with 72 additions and 125 deletions

View file

@ -6,7 +6,7 @@ from cognee.cli import DEFAULT_DOCS_URL
import cognee.cli.echo as fmt
from cognee.cli.exceptions import CliCommandException, CliCommandInnerException
from cognee.modules.data.methods.get_deletion_counts import get_deletion_counts
from cognee.modules.data.methods.delete_datasets_by_name import delete_datasets_by_name
from cognee.modules.data.methods.delete_dataset_by_name import delete_datasets_by_name
from cognee.modules.data.methods.delete_data_by_user import delete_data_by_user
from cognee.modules.users.methods import get_default_user

View file

@ -6,7 +6,7 @@ from textual.containers import Container, Vertical, Horizontal
from textual.binding import Binding
from cognee.cli.tui.base_screen import BaseTUIScreen
from cognee.modules.data.methods.get_deletion_counts import get_deletion_counts
from cognee.modules.data.methods.delete_datasets_by_name import delete_datasets_by_name
from cognee.modules.data.methods.delete_dataset_by_name import delete_datasets_by_name
from cognee.modules.data.methods.delete_data_by_user import delete_data_by_user
from cognee.modules.users.methods import get_default_user

View file

@ -3,8 +3,7 @@ import asyncio
from os import path
import tempfile
from uuid import UUID
from typing import Optional
from typing import AsyncGenerator, List
from typing import Optional, AsyncGenerator, List, Union
from contextlib import asynccontextmanager
from sqlalchemy.orm import joinedload
from sqlalchemy.exc import NoResultFound
@ -235,35 +234,45 @@ class SQLAlchemyAdapter:
return [schema[0] for schema in result.fetchall()]
return []
async def delete_entity_by_id(
self, table_name: str, data_id: UUID, schema_name: Optional[str] = "public"
async def delete_entities_by_id(
self,
table_name: str,
data_id: Union[UUID, List[UUID]], # Supports a single UUID or a List of UUIDs
schema_name: Optional[str] = "public"
):
"""
Delete an entity from the specified table based on its unique ID.
Delete one or more entities from the specified table based on their ID(s).
Parameters:
-----------
- table_name (str): The name of the table from which to delete the entity.
- data_id (UUID): The unique identifier of the entity to be deleted.
- schema_name (Optional[str]): The name of the schema where the table resides,
defaults to 'public'. (default 'public')
- table_name (str): The name of the table from which to delete the entities.
- data_id (Union[UUID, List[UUID]]): The unique identifier(s) to be deleted.
- schema_name (Optional[str]): The name of the schema where the table resides.
"""
if self.engine.dialect.name == "sqlite":
async with self.get_async_session() as session:
TableModel = await self.get_table(table_name, schema_name)
# Foreign key constraints are disabled by default in SQLite (for backwards compatibility),
# so must be enabled for each database connection/session separately.
# Ensure data_ids is a list for the WHERE clause logic
if isinstance(data_id, list):
data_ids_to_delete = data_id
else:
data_ids_to_delete = [data_id]
if not data_ids_to_delete:
return
async with self.get_async_session() as session:
TableModel = await self.get_table(table_name, schema_name)
# Handle SQLite's foreign key requirement
if self.engine.dialect.name == "sqlite":
from sqlalchemy import text
await session.execute(text("PRAGMA foreign_keys = ON;"))
await session.execute(TableModel.delete().where(TableModel.c.id == data_id))
await session.commit()
else:
async with self.get_async_session() as session:
TableModel = await self.get_table(table_name, schema_name)
await session.execute(TableModel.delete().where(TableModel.c.id == data_id))
await session.commit()
# Construct the DELETE statement using the 'in_()' operator
stmt = TableModel.delete().where(TableModel.c.id.in_(data_ids_to_delete))
# Execute and commit
await session.execute(stmt)
await session.commit()
async def delete_data_entity(self, data_id: UUID):
"""

View file

@ -16,7 +16,7 @@ from .get_dataset_ids import get_dataset_ids
# Delete
from .delete_dataset import delete_dataset
from .delete_datasets_by_name import delete_datasets_by_name
from .delete_dataset_by_name import delete_datasets_by_name
from .delete_data_by_user import delete_data_by_user
from .delete_data import delete_data

View file

@ -1,14 +1,14 @@
from uuid import UUID
from sqlalchemy import select, delete as sql_delete
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Dataset, DatasetData
from cognee.modules.data.models import Dataset
from cognee.modules.users.methods import get_user
from cognee.shared.logging_utils import get_logger
logger = get_logger()
async def delete_data_by_user(user_id: UUID) -> dict[str, int]:
async def delete_data_by_user(user_id: UUID):
"""
Delete all datasets and their associated data for a specific user.
@ -18,11 +18,6 @@ async def delete_data_by_user(user_id: UUID) -> dict[str, int]:
Args:
user_id: UUID of the user whose data should be deleted
Returns:
Dictionary containing deletion statistics:
- datasets_deleted: Number of datasets deleted
- data_entries_deleted: Number of data entries deleted
Raises:
ValueError: If user is not found
"""
@ -35,37 +30,7 @@ async def delete_data_by_user(user_id: UUID) -> dict[str, int]:
raise ValueError(f"User with ID {user_id} not found")
# Get all datasets owned by this user
datasets_query = select(Dataset).where(Dataset.owner_id == user_id)
user_datasets = (await session.execute(datasets_query)).scalars().all()
datasets_deleted = 0
data_entries_deleted = 0
# Delete each dataset and its data
for dataset in user_datasets:
# Get all data entries in this dataset
data_query = select(DatasetData).where(DatasetData.dataset_id == dataset.id)
dataset_data_links = (await session.execute(data_query)).scalars().all()
# Delete dataset-data links
for link in dataset_data_links:
await session.execute(
sql_delete(DatasetData).where(DatasetData.id == link.id)
)
data_entries_deleted += 1
# Delete the dataset itself
await session.execute(
sql_delete(Dataset).where(Dataset.id == dataset.id)
)
datasets_deleted += 1
# Commit all changes
await session.commit()
logger.info(f"Deleted {datasets_deleted} datasets and {data_entries_deleted} data entries for user {user_id}")
return {
"datasets_deleted": datasets_deleted,
"data_entries_deleted": data_entries_deleted,
}
datasets_query = select(Dataset.id).where(Dataset.owner_id == user_id)
user_datasets_ids = (await session.execute(datasets_query)).scalars().all()
if user_datasets_ids:
await db_engine.delete_entities_by_id(Dataset.__table__.name, user_datasets_ids)

View file

@ -5,4 +5,4 @@ from cognee.infrastructure.databases.relational import get_relational_engine
async def delete_dataset(dataset: Dataset):
db_engine = get_relational_engine()
return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)
return await db_engine.delete_entities_by_id(dataset.__tablename__, dataset.id)

View file

@ -0,0 +1,30 @@
from uuid import UUID
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models import Dataset
async def delete_dataset_by_name(
dataset_name: str, user_id: UUID
):
"""
Delete a single dataset by name for a specific user.
Args:
dataset_name: The name of the dataset to delete (must be a single string).
user_id: UUID of the dataset owner.
"""
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
dataset_id = (
await session.scalars(
select(Dataset.id)
.filter(Dataset.owner_id == user_id)
.filter(Dataset.name == dataset_name)
)
).first()
#Keeping this out of the first session, since delete_entities_by_id creates another session.
if dataset_id:
await db_engine.delete_entities_by_id(Dataset.__table__.name, dataset_id)

View file

@ -1,57 +0,0 @@
from typing import Union
from uuid import UUID
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models import Dataset
async def delete_datasets_by_name(
dataset_names: Union[str, list[str]], user_id: UUID
) -> dict[str, any]:
"""
Delete datasets by name for a specific user.
Args:
dataset_names: Single dataset name or list of dataset names to delete
user_id: UUID of the dataset owner
Returns:
Dictionary containing:
- deleted_count: Number of datasets deleted
- deleted_ids: List of deleted dataset IDs
- not_found: List of dataset names that were not found
"""
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
# Normalize input to list
if isinstance(dataset_names, str):
dataset_names = [dataset_names]
# Retrieve datasets matching the names and user_id
datasets = (
await session.scalars(
select(Dataset)
.filter(Dataset.owner_id == user_id)
.filter(Dataset.name.in_(dataset_names))
)
).all()
# Track results
deleted_ids = []
found_names = set()
# Delete each dataset
for dataset in datasets:
await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)
deleted_ids.append(dataset.id)
found_names.add(dataset.name)
# Identify datasets that were not found
not_found = [name for name in dataset_names if name not in found_names]
return {
"deleted_count": len(deleted_ids),
"deleted_ids": deleted_ids,
"not_found": not_found
}