diff --git a/README.md b/README.md index a1eebae73..305bffdfe 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Build dynamic memory for Agents and replace RAG using scalable, modular ECL (Ext ## Get Started -Get started quickly with a Google Colab notebook , Deepnote notebook or starter repo +Get started quickly with a Google Colab notebook , Deepnote notebook or starter repo ## About cognee @@ -224,12 +224,12 @@ We now have a paper you can cite: ```bibtex @misc{markovic2025optimizinginterfaceknowledgegraphs, - title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning}, + title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning}, author={Vasilije Markovic and Lazar Obradovic and Laszlo Hajdu and Jovan Pavlovic}, year={2025}, eprint={2505.24478}, archivePrefix={arXiv}, primaryClass={cs.AI}, - url={https://arxiv.org/abs/2505.24478}, + url={https://arxiv.org/abs/2505.24478}, } ``` diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 4051bae86..0a9e76e96 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -1,7 +1,6 @@ from uuid import UUID from typing import Union, Optional, List, Type -from cognee.infrastructure.databases.graph import get_graph_engine from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.users.models import User from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult @@ -9,9 +8,6 @@ from cognee.modules.users.methods import get_default_user from cognee.modules.search.methods import search as search_function from cognee.modules.data.methods import get_authorized_existing_datasets from cognee.modules.data.exceptions import DatasetNotFoundError -from cognee.shared.logging_utils import get_logger - -logger = get_logger() async def search( diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py 
index 67df1a27c..65afdf275 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -159,11 +159,6 @@ class GraphDBInterface(ABC): - get_connections """ - @abstractmethod - async def is_empty(self) -> bool: - logger.warning("is_empty() is not implemented") - return True - @abstractmethod async def query(self, query: str, params: dict) -> List[Any]: """ diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index 2d3866888..3f0fb0c57 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -198,15 +198,6 @@ class KuzuAdapter(GraphDBInterface): except FileNotFoundError: logger.warning(f"Kuzu S3 storage file not found: {self.db_path}") - async def is_empty(self) -> bool: - query = """ - MATCH (n) - RETURN true - LIMIT 1; - """ - query_result = await self.query(query) - return len(query_result) == 0 - async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]: """ Execute a Kuzu query asynchronously with automatic reconnection. 
diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 5861b69cb..365d02979 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -87,15 +87,6 @@ class Neo4jAdapter(GraphDBInterface): async with self.driver.session(database=self.graph_database_name) as session: yield session - async def is_empty(self) -> bool: - query = """ - RETURN EXISTS { - MATCH (n) - } AS node_exists; - """ - query_result = await self.query(query) - return not query_result[0]["node_exists"] - @deadlock_retry() async def query( self, @@ -1076,7 +1067,7 @@ class Neo4jAdapter(GraphDBInterface): query_nodes = f""" MATCH (n) WHERE {where_clause} - RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties + RETURN n.id AS id, labels(n) AS labels, properties(n) AS properties """ result_nodes = await self.query(query_nodes) @@ -1091,7 +1082,7 @@ class Neo4jAdapter(GraphDBInterface): query_edges = f""" MATCH (n)-[r]->(m) WHERE {where_clause} AND {where_clause.replace("n.", "m.")} - RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties + RETURN n.id AS source, m.id AS target, TYPE(r) AS type, properties(r) AS properties """ result_edges = await self.query(query_edges) diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py index edd2d89b0..dcdd68cad 100644 --- a/cognee/infrastructure/files/utils/guess_file_type.py +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -124,6 +124,12 @@ def guess_file_type(file: BinaryIO) -> filetype.Type: """ file_type = filetype.guess(file) + # If file type could not be determined consider it a plain text file as they don't have magic number encoding + if file_type is None: + from filetype.types.base import Type + + file_type = Type("text/plain", "txt") + if file_type is None: 
raise FileTypeException(f"Unknown file detected: {file.name}.") diff --git a/cognee/modules/pipelines/operations/run_tasks_distributed.py b/cognee/modules/pipelines/operations/run_tasks_distributed.py index 95cdb0266..3fce3763d 100644 --- a/cognee/modules/pipelines/operations/run_tasks_distributed.py +++ b/cognee/modules/pipelines/operations/run_tasks_distributed.py @@ -88,6 +88,7 @@ async def run_tasks_distributed( pipeline_name: str = "unknown_pipeline", context: dict = None, incremental_loading: bool = False, + data_per_batch: int = 20, ): if not user: user = await get_default_user() diff --git a/cognee/tests/test_kuzu.py b/cognee/tests/test_kuzu.py index fe9da6dcb..8749e42d0 100644 --- a/cognee/tests/test_kuzu.py +++ b/cognee/tests/test_kuzu.py @@ -47,26 +47,10 @@ async def main(): pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt" ) - from cognee.infrastructure.databases.graph import get_graph_engine - - graph_engine = await get_graph_engine() - - is_empty = await graph_engine.is_empty() - - assert is_empty, "Kuzu graph database is not empty" - await cognee.add([explanation_file_path_quantum], dataset_name) - is_empty = await graph_engine.is_empty() - - assert is_empty, "Kuzu graph database should be empty before cognify" - await cognee.cognify([dataset_name]) - is_empty = await graph_engine.is_empty() - - assert not is_empty, "Kuzu graph database should not be empty" - from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() @@ -130,10 +114,11 @@ async def main(): assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) + from cognee.infrastructure.databases.graph import get_graph_engine - is_empty = await graph_engine.is_empty() - - assert is_empty, "Kuzu graph database is not empty" + graph_engine = await get_graph_engine() + nodes, edges = await graph_engine.get_graph_data() + assert len(nodes) == 0 and len(edges) == 0, 
"Kuzu graph database is not empty" finally: # Ensure cleanup even if tests fail diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 925614e67..c74b4ab65 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -35,14 +35,6 @@ async def main(): explanation_file_path_nlp = os.path.join( pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" ) - from cognee.infrastructure.databases.graph import get_graph_engine - - graph_engine = await get_graph_engine() - - is_empty = await graph_engine.is_empty() - - assert is_empty, "Graph has to be empty" - await cognee.add([explanation_file_path_nlp], dataset_name) explanation_file_path_quantum = os.path.join( @@ -50,16 +42,9 @@ async def main(): ) await cognee.add([explanation_file_path_quantum], dataset_name) - is_empty = await graph_engine.is_empty() - - assert is_empty, "Graph has to be empty before cognify" await cognee.cognify([dataset_name]) - is_empty = await graph_engine.is_empty() - - assert not is_empty, "Graph shouldn't be empty" - from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() @@ -132,8 +117,11 @@ async def main(): assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) - is_empty = await graph_engine.is_empty() - assert is_empty, "Neo4j graph database is not empty" + from cognee.infrastructure.databases.graph import get_graph_engine + + graph_engine = await get_graph_engine() + nodes, edges = await graph_engine.get_graph_data() + assert len(nodes) == 0 and len(edges) == 0, "Neo4j graph database is not empty" if __name__ == "__main__": diff --git a/cognee/tests/unit/api/test_search.py b/cognee/tests/unit/api/test_search.py deleted file mode 100644 index 54a4cc35f..000000000 --- a/cognee/tests/unit/api/test_search.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest -import cognee - - -@pytest.mark.asyncio -async def 
test_empty_search_raises_SearchOnEmptyGraphError_on_empty_graph(): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await cognee.add("Sample input") - result = await cognee.search("Sample query") - assert result == [] - - -@pytest.mark.asyncio -async def test_empty_search_doesnt_raise_SearchOnEmptyGraphError(): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await cognee.add("Sample input") - await cognee.cognify() - result = await cognee.search("Sample query") - assert result != []