From f6800b979edad66102ac449e00889b709a2a4e33 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 18 Dec 2024 15:26:13 +0100 Subject: [PATCH 1/8] feat: Add deletion of local files when deleting data Delete local files when deleting data from cognee Feature COG-475 --- .../sqlalchemy/SqlAlchemyAdapter.py | 28 +++++++++++++++++-- cognee/modules/data/methods/delete_data.py | 2 +- cognee/modules/data/methods/delete_dataset.py | 2 +- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 8041aeaea..e0db40ca3 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -3,11 +3,12 @@ from uuid import UUID from typing import Optional from typing import AsyncGenerator, List from contextlib import asynccontextmanager -from sqlalchemy import text, select, MetaData, Table +from sqlalchemy import text, select, MetaData, Table, delete from sqlalchemy.orm import joinedload from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker from cognee.infrastructure.databases.exceptions import EntityNotFoundError +from cognee.modules.data.models.Data import Data from ..ModelBase import Base class SQLAlchemyAdapter(): @@ -86,9 +87,9 @@ class SQLAlchemyAdapter(): return [schema[0] for schema in result.fetchall()] return [] - async def delete_data_by_id(self, table_name: str, data_id: UUID, schema_name: Optional[str] = "public"): + async def delete_entity_by_id(self, table_name: str, data_id: UUID, schema_name: Optional[str] = "public"): """ - Delete data in given table based on id. Table must have an id Column. + Delete entity in given table based on id. Table must have an id Column. 
""" if self.engine.dialect.name == "sqlite": async with self.get_async_session() as session: @@ -107,6 +108,27 @@ class SQLAlchemyAdapter(): await session.commit() + async def delete_data_entity(self, data_id: UUID): + """ + Delete data and local files related to data if there are no references to it anymore. + """ + if self.engine.dialect.name == "sqlite": + async with self.get_async_session() as session: + + # Foreign key constraints are disabled by default in SQLite (for backwards compatibility), + # so must be enabled for each database connection/session separately. + await session.execute(text("PRAGMA foreign_keys = ON;")) + + data_entity = await session.execute(select(Data).where(Data.id == data_id)) + + await session.execute(delete(Data).where(Data.id == data_id)) + await session.commit() + else: + async with self.get_async_session() as session: + await session.execute(delete(Data).where(Data.id == data_id)) + await session.commit() + + async def get_table(self, table_name: str, schema_name: Optional[str] = "public") -> Table: """ Dynamically loads a table using the given table name and schema name. 
diff --git a/cognee/modules/data/methods/delete_data.py b/cognee/modules/data/methods/delete_data.py index c0493a606..f0e9629ec 100644 --- a/cognee/modules/data/methods/delete_data.py +++ b/cognee/modules/data/methods/delete_data.py @@ -17,4 +17,4 @@ async def delete_data(data: Data): db_engine = get_relational_engine() - return await db_engine.delete_data_by_id(data.__tablename__, data.id) + return await db_engine.delete_entity_by_id(data.__tablename__, data.id) diff --git a/cognee/modules/data/methods/delete_dataset.py b/cognee/modules/data/methods/delete_dataset.py index c2205144d..96a2e7d71 100644 --- a/cognee/modules/data/methods/delete_dataset.py +++ b/cognee/modules/data/methods/delete_dataset.py @@ -4,4 +4,4 @@ from cognee.infrastructure.databases.relational import get_relational_engine async def delete_dataset(dataset: Dataset): db_engine = get_relational_engine() - return await db_engine.delete_data_by_id(dataset.__tablename__, dataset.id) + return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id) From ee6bc1703249e2c6bc3e95ae2de7ffe9087633af Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 18 Dec 2024 17:10:08 +0100 Subject: [PATCH 2/8] refactor: Add current development status Save current development status Refactor --- .../sqlalchemy/SqlAlchemyAdapter.py | 29 +++- cognee/tests/test_pgvector.py | 137 +++++++++--------- 2 files changed, 99 insertions(+), 67 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index e0db40ca3..7eb4ccc2f 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -5,10 +5,12 @@ from typing import AsyncGenerator, List from contextlib import asynccontextmanager from sqlalchemy import text, select, MetaData, Table, delete from sqlalchemy.orm import joinedload +from
sqlalchemy.exc import NoResultFound from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker from cognee.infrastructure.databases.exceptions import EntityNotFoundError from cognee.modules.data.models.Data import Data + from ..ModelBase import Base class SQLAlchemyAdapter(): @@ -119,12 +121,37 @@ class SQLAlchemyAdapter(): # so must be enabled for each database connection/session separately. await session.execute(text("PRAGMA foreign_keys = ON;")) - data_entity = await session.execute(select(Data).where(Data.id == data_id)) + data_entity = await session.scalars(select(Data).where(Data.id == data_id)).one() + + # Check if other data objects point to the same raw data location + raw_data_location_entities= await session.execute( + select(Data).where(Data.raw_data_location == data_entity.raw_data_location)).all() + + # Don't delete local file unless this is the only reference to the file in the database + if len(raw_data_location_entities) == 1: + # delete local file + from cognee.base_config import get_base_config + config get_base_config() + await session.execute(delete(Data).where(Data.id == data_id)) await session.commit() else: async with self.get_async_session() as session: + try: + data_entity = (await session.scalars(select(Data).where(Data.id == data_id))).one() + except (ValueError, NoResultFound) as e: + raise EntityNotFoundError(message=f"Entity not found: {str(e)}") + + # Check if other data objects point to the same raw data location + raw_data_location_entities = (await session.execute( + select(Data).where(Data.raw_data_location == data_entity.raw_data_location))).all() + + # Don't delete local file unless this is the only reference to the file in the database + if len(raw_data_location_entities) == 1: + # delete local file + pass + await session.execute(delete(Data).where(Data.id == data_id)) await session.commit() diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index 9554a3f9d..d0326ac91 
100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -42,76 +42,81 @@ async def main(): ) cognee.config.system_root_directory(cognee_directory_path) - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata = True) + from cognee.infrastructure.databases.relational.get_relational_engine import get_relational_engine - dataset_name_1 = "natural_language" - dataset_name_2 = "quantum" + engine = get_relational_engine() + await engine.delete_data_entity("2cba57c3-d7ec-5746-b819-d89f87f05c18") - explanation_file_path = os.path.join( - pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" - ) - await cognee.add([explanation_file_path], dataset_name_1) + # await cognee.prune.prune_data() + # await cognee.prune.prune_system(metadata = True) - text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena. - At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states. - Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible. - The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. 
Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly. - Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate. - In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited. 
- """ + # dataset_name_1 = "natural_language" + # dataset_name_2 = "quantum" + # + # explanation_file_path = os.path.join( + # pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" + # ) + # await cognee.add([explanation_file_path], dataset_name_1) + # + # text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena. + # At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states. + # Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible. + # The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly. 
+ # Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate. + # In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited. 
+ # """ + # + # await cognee.add([text], dataset_name_2) + # + # await cognee.cognify([dataset_name_2, dataset_name_1]) + # + # from cognee.infrastructure.databases.vector import get_vector_engine + # + # # Test getting of documents for search per dataset + # from cognee.modules.users.permissions.methods import get_document_ids_for_user + # user = await get_default_user() + # document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) + # assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1" + # + # # Test getting of documents for search when no dataset is provided + # user = await get_default_user() + # document_ids = await get_document_ids_for_user(user.id) + # assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" + # + # vector_engine = get_vector_engine() + # random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0] + # random_node_name = random_node.payload["text"] + # + # search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name) + # assert len(search_results) != 0, "The search results list is empty." + # print("\n\nExtracted sentences are:\n") + # for result in search_results: + # print(f"{result}\n") + # + # search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name, datasets=[dataset_name_2]) + # assert len(search_results) != 0, "The search results list is empty." + # print("\n\nExtracted chunks are:\n") + # for result in search_results: + # print(f"{result}\n") + # + # search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name) + # assert len(search_results) != 0, "Query related summaries don't exist." + # print("\n\nExtracted summaries are:\n") + # for result in search_results: + # print(f"{result}\n") + # + # history = await cognee.get_search_history() + # assert len(history) == 6, "Search history is not correct." 
+ # + # results = await brute_force_triplet_search('What is a quantum computer?') + # assert len(results) > 0 - await cognee.add([text], dataset_name_2) - - await cognee.cognify([dataset_name_2, dataset_name_1]) - - from cognee.infrastructure.databases.vector import get_vector_engine - - # Test getting of documents for search per dataset - from cognee.modules.users.permissions.methods import get_document_ids_for_user - user = await get_default_user() - document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1" - - # Test getting of documents for search when no dataset is provided - user = await get_default_user() - document_ids = await get_document_ids_for_user(user.id) - assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" - - vector_engine = get_vector_engine() - random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0] - random_node_name = random_node.payload["text"] - - search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name) - assert len(search_results) != 0, "The search results list is empty." - print("\n\nExtracted sentences are:\n") - for result in search_results: - print(f"{result}\n") - - search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name, datasets=[dataset_name_2]) - assert len(search_results) != 0, "The search results list is empty." - print("\n\nExtracted chunks are:\n") - for result in search_results: - print(f"{result}\n") - - search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name) - assert len(search_results) != 0, "Query related summaries don't exist." - print("\n\nExtracted summaries are:\n") - for result in search_results: - print(f"{result}\n") - - history = await cognee.get_search_history() - assert len(history) == 6, "Search history is not correct." 
- - results = await brute_force_triplet_search('What is a quantum computer?') - assert len(results) > 0 - - await cognee.prune.prune_data() - assert not os.path.isdir(data_directory_path), "Local data files are not deleted" - - await cognee.prune.prune_system(metadata=True) - tables_in_database = await vector_engine.get_table_names() - assert len(tables_in_database) == 0, "PostgreSQL database is not empty" + # await cognee.prune.prune_data() + # assert not os.path.isdir(data_directory_path), "Local data files are not deleted" + # + # await cognee.prune.prune_system(metadata=True) + # tables_in_database = await vector_engine.get_table_names() + # assert len(tables_in_database) == 0, "PostgreSQL database is not empty" if __name__ == "__main__": import asyncio From d9368c6398b97366615dd50809c0607a1ccc7193 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 19 Dec 2024 15:15:13 +0100 Subject: [PATCH 3/8] chore: Add temporary state of development Add temp development state to branch Chore --- .../relational/sqlalchemy/SqlAlchemyAdapter.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 7eb4ccc2f..c06aa833d 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -131,7 +131,10 @@ class SQLAlchemyAdapter(): if len(raw_data_location_entities) == 1: # delete local file from cognee.base_config import get_base_config - config get_base_config() + config = get_base_config() + if config.data_root_directory in raw_data_location_entities[0]: + # delete local file + pass await session.execute(delete(Data).where(Data.id == data_id)) @@ -150,7 +153,11 @@ class SQLAlchemyAdapter(): # Don't delete local file unless this is the only reference to the file in the database if 
len(raw_data_location_entities) == 1: # delete local file - pass + from cognee.base_config import get_base_config + config = get_base_config() + if config.data_root_directory in raw_data_location_entities[0].raw_data_location: + # delete local file + pass await session.execute(delete(Data).where(Data.id == data_id)) await session.commit() From c139d52938542fee46ebfda078ed181e06764e5f Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 19 Dec 2024 16:35:35 +0100 Subject: [PATCH 4/8] feat: Add deletion of local files made by cognee through data endpoint Delete local files made by cognee when deleting data from database through endpoint Feature COG-475 --- .../sqlalchemy/SqlAlchemyAdapter.py | 60 ++++++------------- cognee/modules/data/methods/delete_data.py | 2 +- 2 files changed, 20 insertions(+), 42 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index c06aa833d..3b9a36e71 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -1,3 +1,4 @@ +import os from os import path from uuid import UUID from typing import Optional @@ -114,54 +115,31 @@ class SQLAlchemyAdapter(): """ Delete data and local files related to data if there are no references to it anymore. """ - if self.engine.dialect.name == "sqlite": - async with self.get_async_session() as session: - + async with self.get_async_session() as session: + if self.engine.dialect.name == "sqlite": # Foreign key constraints are disabled by default in SQLite (for backwards compatibility), # so must be enabled for each database connection/session separately. 
await session.execute(text("PRAGMA foreign_keys = ON;")) - data_entity = await session.scalars(select(Data).where(Data.id == data_id)).one() + try: + data_entity = (await session.scalars(select(Data).where(Data.id == data_id))).one() + except (ValueError, NoResultFound) as e: + raise EntityNotFoundError(message=f"Entity not found: {str(e)}") - # Check if other data objects point to the same raw data location - raw_data_location_entities= await session.execute( - select(Data).where(Data.raw_data_location == data_entity.raw_data_location)).all() + # Check if other data objects point to the same raw data location + raw_data_location_entities = (await session.execute( + select(Data).where(Data.raw_data_location == data_entity.raw_data_location))).all() - # Don't delete local file unless this is the only reference to the file in the database - if len(raw_data_location_entities) == 1: - # delete local file - from cognee.base_config import get_base_config - config = get_base_config() - if config.data_root_directory in raw_data_location_entities[0]: - # delete local file - pass - - - await session.execute(delete(Data).where(Data.id == data_id)) - await session.commit() - else: - async with self.get_async_session() as session: - try: - data_entity = (await session.scalars(select(Data).where(Data.id == data_id))).one() - except (ValueError, NoResultFound) as e: - raise EntityNotFoundError(message=f"Entity not found: {str(e)}") - - # Check if other data objects point to the same raw data location - raw_data_location_entities = (await session.execute( - select(Data).where(Data.raw_data_location == data_entity.raw_data_location))).all() - - # Don't delete local file unless this is the only reference to the file in the database - if len(raw_data_location_entities) == 1: - # delete local file - from cognee.base_config import get_base_config - config = get_base_config() - if config.data_root_directory in raw_data_location_entities[0].raw_data_location: - # delete local file - pass 
- - await session.execute(delete(Data).where(Data.id == data_id)) - await session.commit() + # Don't delete local file unless this is the only reference to the file in the database + if len(raw_data_location_entities) == 1: + # delete local file only if it's created by cognee + from cognee.base_config import get_base_config + config = get_base_config() + if config.data_root_directory in raw_data_location_entities[0].raw_data_location: + os.remove(raw_data_location_entities[0]) + await session.execute(delete(Data).where(Data.id == data_id)) + await session.commit() async def get_table(self, table_name: str, schema_name: Optional[str] = "public") -> Table: """ diff --git a/cognee/modules/data/methods/delete_data.py b/cognee/modules/data/methods/delete_data.py index f0e9629ec..65abe714a 100644 --- a/cognee/modules/data/methods/delete_data.py +++ b/cognee/modules/data/methods/delete_data.py @@ -17,4 +17,4 @@ async def delete_data(data: Data): db_engine = get_relational_engine() - return await db_engine.delete_entity_by_id(data.__tablename__, data.id) + return await db_engine.delete_data_entity(data.id) From 418cb65b358789984fe8fe150c4c728542a19640 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 19 Dec 2024 16:38:50 +0100 Subject: [PATCH 5/8] test: Revert changes on test_pgvector Revert changes on test_pgvector which were made to test deletion of local files Test COG-475 --- cognee/tests/test_pgvector.py | 137 ++++++++++++++++------------------ 1 file changed, 66 insertions(+), 71 deletions(-) diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index d0326ac91..9554a3f9d 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -42,81 +42,76 @@ async def main(): ) cognee.config.system_root_directory(cognee_directory_path) - from cognee.infrastructure.databases.relational.get_relational_engine import get_relational_engine + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata = True) - engine = 
get_relational_engine() - await engine.delete_data_entity("2cba57c3-d7ec-5746-b819-d89f87f05c18") + dataset_name_1 = "natural_language" + dataset_name_2 = "quantum" - # await cognee.prune.prune_data() - # await cognee.prune.prune_system(metadata = True) + explanation_file_path = os.path.join( + pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" + ) + await cognee.add([explanation_file_path], dataset_name_1) - # dataset_name_1 = "natural_language" - # dataset_name_2 = "quantum" - # - # explanation_file_path = os.path.join( - # pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" - # ) - # await cognee.add([explanation_file_path], dataset_name_1) - # - # text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena. - # At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states. - # Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible. - # The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. 
When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly. - # Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate. - # In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited. 
- # """ - # - # await cognee.add([text], dataset_name_2) - # - # await cognee.cognify([dataset_name_2, dataset_name_1]) - # - # from cognee.infrastructure.databases.vector import get_vector_engine - # - # # Test getting of documents for search per dataset - # from cognee.modules.users.permissions.methods import get_document_ids_for_user - # user = await get_default_user() - # document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - # assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1" - # - # # Test getting of documents for search when no dataset is provided - # user = await get_default_user() - # document_ids = await get_document_ids_for_user(user.id) - # assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" - # - # vector_engine = get_vector_engine() - # random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0] - # random_node_name = random_node.payload["text"] - # - # search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name) - # assert len(search_results) != 0, "The search results list is empty." - # print("\n\nExtracted sentences are:\n") - # for result in search_results: - # print(f"{result}\n") - # - # search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name, datasets=[dataset_name_2]) - # assert len(search_results) != 0, "The search results list is empty." - # print("\n\nExtracted chunks are:\n") - # for result in search_results: - # print(f"{result}\n") - # - # search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name) - # assert len(search_results) != 0, "Query related summaries don't exist." - # print("\n\nExtracted summaries are:\n") - # for result in search_results: - # print(f"{result}\n") - # - # history = await cognee.get_search_history() - # assert len(history) == 6, "Search history is not correct." 
- # - # results = await brute_force_triplet_search('What is a quantum computer?') - # assert len(results) > 0 + text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena. + At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states. + Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible. + The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly. + Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. 
Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate. + In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited. 
+ """ - # await cognee.prune.prune_data() - # assert not os.path.isdir(data_directory_path), "Local data files are not deleted" - # - # await cognee.prune.prune_system(metadata=True) - # tables_in_database = await vector_engine.get_table_names() - # assert len(tables_in_database) == 0, "PostgreSQL database is not empty" + await cognee.add([text], dataset_name_2) + + await cognee.cognify([dataset_name_2, dataset_name_1]) + + from cognee.infrastructure.databases.vector import get_vector_engine + + # Test getting of documents for search per dataset + from cognee.modules.users.permissions.methods import get_document_ids_for_user + user = await get_default_user() + document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) + assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1" + + # Test getting of documents for search when no dataset is provided + user = await get_default_user() + document_ids = await get_document_ids_for_user(user.id) + assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" + + vector_engine = get_vector_engine() + random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0] + random_node_name = random_node.payload["text"] + + search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name) + assert len(search_results) != 0, "The search results list is empty." + print("\n\nExtracted sentences are:\n") + for result in search_results: + print(f"{result}\n") + + search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name, datasets=[dataset_name_2]) + assert len(search_results) != 0, "The search results list is empty." + print("\n\nExtracted chunks are:\n") + for result in search_results: + print(f"{result}\n") + + search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name) + assert len(search_results) != 0, "Query related summaries don't exist." 
+ print("\n\nExtracted summaries are:\n") + for result in search_results: + print(f"{result}\n") + + history = await cognee.get_search_history() + assert len(history) == 6, "Search history is not correct." + + results = await brute_force_triplet_search('What is a quantum computer?') + assert len(results) > 0 + + await cognee.prune.prune_data() + assert not os.path.isdir(data_directory_path), "Local data files are not deleted" + + await cognee.prune.prune_system(metadata=True) + tables_in_database = await vector_engine.get_table_names() + assert len(tables_in_database) == 0, "PostgreSQL database is not empty" if __name__ == "__main__": import asyncio From 1406b6f0781baa1dfab0b3348e36bc30d7368ee5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 19 Dec 2024 17:30:02 +0100 Subject: [PATCH 6/8] test: Add test to verify deletion of local files Added test that checks local files created by cognee will be deleted and those not created by cognee won't Test COG-475 --- .../sqlalchemy/SqlAlchemyAdapter.py | 4 +- cognee/tests/test_pgvector.py | 53 +++++++++++++++---- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 3b9a36e71..006f2ed98 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -128,7 +128,7 @@ class SQLAlchemyAdapter(): # Check if other data objects point to the same raw data location raw_data_location_entities = (await session.execute( - select(Data).where(Data.raw_data_location == data_entity.raw_data_location))).all() + select(Data.raw_data_location).where(Data.raw_data_location == data_entity.raw_data_location))).all() # Don't delete local file unless this is the only reference to the file in the database if len(raw_data_location_entities) == 1: @@ -136,7 +136,7 @@ class 
SQLAlchemyAdapter(): from cognee.base_config import get_base_config config = get_base_config() if config.data_root_directory in raw_data_location_entities[0].raw_data_location: - os.remove(raw_data_location_entities[0]) + os.remove(raw_data_location_entities[0].raw_data_location) await session.execute(delete(Data).where(Data.id == data_id)) await session.commit() diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index 9554a3f9d..9da34c3d4 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -2,12 +2,52 @@ import os import logging import pathlib import cognee + +from cognee.modules.data.models import Data from cognee.api.v1.search import SearchType from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search from cognee.modules.users.methods import get_default_user logging.basicConfig(level=logging.DEBUG) +async def test_local_file_deletion(data_text, file_location): + from sqlalchemy import select + import hashlib + from cognee.infrastructure.databases.relational import get_relational_engine + + engine = get_relational_engine() + + async with engine.get_async_session() as session: + # Get hash of data contents + encoded_text = data_text.encode("utf-8") + data_hash = hashlib.md5(encoded_text).hexdigest() + # Get data entry from database based on hash contents + data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one() + assert os.path.isfile(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}" + # Test deletion of data along with local files created by cognee + await engine.delete_data_entity(data.id) + assert not os.path.exists(data.raw_data_location), f"Data location exists: {data.raw_data_location}" + + async with engine.get_async_session() as session: + # Get data entry from database based on file path + data = (await session.scalars(select(Data).where(Data.raw_data_location == file_location))).one() + assert 
os.path.isfile(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}" + # Test local files not created by cognee won't get deleted + await engine.delete_data_entity(data.id) + assert os.path.exists(data.raw_data_location), f"Data location doesn't exists: {data.raw_data_location}" + +async def test_getting_of_documents(dataset_name_1): + # Test getting of documents for search per dataset + from cognee.modules.users.permissions.methods import get_document_ids_for_user + user = await get_default_user() + document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) + assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1" + + # Test getting of documents for search when no dataset is provided + user = await get_default_user() + document_ids = await get_document_ids_for_user(user.id) + assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" + async def main(): cognee.config.set_vector_db_config( @@ -67,16 +107,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine - # Test getting of documents for search per dataset - from cognee.modules.users.permissions.methods import get_document_ids_for_user - user = await get_default_user() - document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1" - - # Test getting of documents for search when no dataset is provided - user = await get_default_user() - document_ids = await get_document_ids_for_user(user.id) - assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2" + await test_getting_of_documents(dataset_name_1) vector_engine = get_vector_engine() random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0] @@ -106,6 +137,8 @@ async def main(): results = await 
brute_force_triplet_search('What is a quantum computer?') assert len(results) > 0 + await test_local_file_deletion(text, explanation_file_path) + await cognee.prune.prune_data() assert not os.path.isdir(data_directory_path), "Local data files are not deleted" From d7195f9786843f186cd2d9a13f3bebe14200f363 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 20 Dec 2024 15:17:35 +0100 Subject: [PATCH 7/8] test: Update test regarding message Update assertion message, add verification of file existence --- .../databases/relational/sqlalchemy/SqlAlchemyAdapter.py | 8 +++++++- cognee/tests/test_pgvector.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 006f2ed98..ce3a479e5 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -132,11 +132,17 @@ class SQLAlchemyAdapter(): # Don't delete local file unless this is the only reference to the file in the database if len(raw_data_location_entities) == 1: + # delete local file only if it's created by cognee from cognee.base_config import get_base_config config = get_base_config() + if config.data_root_directory in raw_data_location_entities[0].raw_data_location: - os.remove(raw_data_location_entities[0].raw_data_location) + if os.path.exists(raw_data_location_entities[0].raw_data_location): + os.remove(raw_data_location_entities[0].raw_data_location) + else: + # Report bug as file should exist + pass await session.execute(delete(Data).where(Data.id == data_id)) await session.commit() diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index 9da34c3d4..417904089 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -26,7 +26,8 @@ async def test_local_file_deletion(data_text, file_location):
assert os.path.isfile(data.raw_data_location), f"Data location doesn't exist: {data.raw_data_location}" # Test deletion of data along with local files created by cognee await engine.delete_data_entity(data.id) - assert not os.path.exists(data.raw_data_location), f"Data location exists: {data.raw_data_location}" + assert not os.path.exists( + data.raw_data_location), f"Data location still exists after deletion: {data.raw_data_location}" async with engine.get_async_session() as session: # Get data entry from database based on file path From 450bef4c912335f65c6ccf4905af4636aded7364 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 20 Dec 2024 16:20:40 +0100 Subject: [PATCH 8/8] chore: Add logging to sentry when file which should exist can't be found Log to sentry that a file which should exist can't be found Chore COG-475 --- .../databases/relational/sqlalchemy/SqlAlchemyAdapter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index ce3a479e5..c197efc72 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -1,5 +1,6 @@ import os from os import path +import logging from uuid import UUID from typing import Optional from typing import AsyncGenerator, List @@ -14,6 +15,9 @@ from cognee.modules.data.models.Data import Data from ..ModelBase import Base + +logger = logging.getLogger(__name__) + class SQLAlchemyAdapter(): def __init__(self, connection_string: str): self.db_path: str = None @@ -142,7 +146,7 @@ class SQLAlchemyAdapter(): os.remove(raw_data_location_entities[0].raw_data_location) else: # Report bug as file should exist - pass + logger.error("Local file which should exist can't be found.") await session.execute(delete(Data).where(Data.id == data_id)) await session.commit()