From 1406b6f0781baa1dfab0b3348e36bc30d7368ee5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 19 Dec 2024 17:30:02 +0100 Subject: [PATCH] test: Add test to verify deletion of local files Added a test that checks that local files created by cognee are deleted and that files not created by cognee are left untouched. Test COG-475 --- .../sqlalchemy/SqlAlchemyAdapter.py | 4 +- cognee/tests/test_pgvector.py | 53 +++++++++++++++---- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 3b9a36e71..006f2ed98 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -128,7 +128,7 @@ class SQLAlchemyAdapter(): # Check if other data objects point to the same raw data location raw_data_location_entities = (await session.execute( - select(Data).where(Data.raw_data_location == data_entity.raw_data_location))).all() + select(Data.raw_data_location).where(Data.raw_data_location == data_entity.raw_data_location))).all() # Don't delete local file unless this is the only reference to the file in the database if len(raw_data_location_entities) == 1: @@ -136,7 +136,7 @@ class SQLAlchemyAdapter(): from cognee.base_config import get_base_config config = get_base_config() if config.data_root_directory in raw_data_location_entities[0].raw_data_location: - os.remove(raw_data_location_entities[0]) + os.remove(raw_data_location_entities[0].raw_data_location) await session.execute(delete(Data).where(Data.id == data_id)) await session.commit() diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index 9554a3f9d..9da34c3d4 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -2,12 +2,52 @@ import os import logging import pathlib import cognee + +from cognee.modules.data.models import 
Data from cognee.api.v1.search import SearchType from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search from cognee.modules.users.methods import get_default_user logging.basicConfig(level=logging.DEBUG) +async def test_local_file_deletion(data_text, file_location): + from sqlalchemy import select + import hashlib + from cognee.infrastructure.databases.relational import get_relational_engine + + engine = get_relational_engine() + + async with engine.get_async_session() as session: + # Get hash of data contents + encoded_text = data_text.encode("utf-8") + data_hash = hashlib.md5(encoded_text).hexdigest() + # Get data entry from database based on hash contents + data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one() + assert os.path.isfile(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}" + # Test deletion of data along with local files created by cognee + await engine.delete_data_entity(data.id) + assert not os.path.exists(data.raw_data_location), f"Data location exists: {data.raw_data_location}" + + async with engine.get_async_session() as session: + # Get data entry from database based on file path + data = (await session.scalars(select(Data).where(Data.raw_data_location == file_location))).one() + assert os.path.isfile(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}" + # Test local files not created by cognee won't get deleted + await engine.delete_data_entity(data.id) + assert os.path.exists(data.raw_data_location), f"Data location doesn't exists: {data.raw_data_location}" + +async def test_getting_of_documents(dataset_name_1): + # Test getting of documents for search per dataset + from cognee.modules.users.permissions.methods import get_document_ids_for_user + user = await get_default_user() + document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) + assert len(document_ids) == 1, f"Number of expected documents 
doesn't match {len(document_ids)} != 1" + + # Test getting of documents for search when no dataset is provided + user = await get_default_user() + document_ids = await get_document_ids_for_user(user.id) + assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" + async def main(): cognee.config.set_vector_db_config( @@ -67,16 +107,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine - # Test getting of documents for search per dataset - from cognee.modules.users.permissions.methods import get_document_ids_for_user - user = await get_default_user() - document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1" - - # Test getting of documents for search when no dataset is provided - user = await get_default_user() - document_ids = await get_document_ids_for_user(user.id) - assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" + await test_getting_of_documents(dataset_name_1) vector_engine = get_vector_engine() random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0] @@ -106,6 +137,8 @@ async def main(): results = await brute_force_triplet_search('What is a quantum computer?') assert len(results) > 0 + await test_local_file_deletion(text, explanation_file_path) + await cognee.prune.prune_data() assert not os.path.isdir(data_directory_path), "Local data files are not deleted"