diff --git a/README.md b/README.md
index a1eebae73..305bffdfe 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ Build dynamic memory for Agents and replace RAG using scalable, modular ECL (Ext
## Get Started
-Get started quickly with a Google Colab notebook , Deepnote notebook or starter repo
+Get started quickly with a Google Colab notebook, Deepnote notebook, or starter repo
## About cognee
@@ -224,12 +224,12 @@ We now have a paper you can cite:
```bibtex
@misc{markovic2025optimizinginterfaceknowledgegraphs,
- title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning},
+ title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning},
author={Vasilije Markovic and Lazar Obradovic and Laszlo Hajdu and Jovan Pavlovic},
year={2025},
eprint={2505.24478},
archivePrefix={arXiv},
primaryClass={cs.AI},
- url={https://arxiv.org/abs/2505.24478},
+ url={https://arxiv.org/abs/2505.24478},
}
```
diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py
index 4051bae86..0a9e76e96 100644
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@@ -1,7 +1,6 @@
from uuid import UUID
from typing import Union, Optional, List, Type
-from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.engine.models.node_set import NodeSet
from cognee.modules.users.models import User
from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult
@@ -9,9 +8,6 @@ from cognee.modules.users.methods import get_default_user
from cognee.modules.search.methods import search as search_function
from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.modules.data.exceptions import DatasetNotFoundError
-from cognee.shared.logging_utils import get_logger
-
-logger = get_logger()
async def search(
diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py
index 67df1a27c..65afdf275 100644
--- a/cognee/infrastructure/databases/graph/graph_db_interface.py
+++ b/cognee/infrastructure/databases/graph/graph_db_interface.py
@@ -159,11 +159,6 @@ class GraphDBInterface(ABC):
- get_connections
"""
- @abstractmethod
- async def is_empty(self) -> bool:
- logger.warning("is_empty() is not implemented")
- return True
-
@abstractmethod
async def query(self, query: str, params: dict) -> List[Any]:
"""
diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py
index 2d3866888..3f0fb0c57 100644
--- a/cognee/infrastructure/databases/graph/kuzu/adapter.py
+++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py
@@ -198,15 +198,6 @@ class KuzuAdapter(GraphDBInterface):
except FileNotFoundError:
logger.warning(f"Kuzu S3 storage file not found: {self.db_path}")
- async def is_empty(self) -> bool:
- query = """
- MATCH (n)
- RETURN true
- LIMIT 1;
- """
- query_result = await self.query(query)
- return len(query_result) == 0
-
async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]:
"""
Execute a Kuzu query asynchronously with automatic reconnection.
diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
index 5861b69cb..365d02979 100644
--- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@@ -87,15 +87,6 @@ class Neo4jAdapter(GraphDBInterface):
async with self.driver.session(database=self.graph_database_name) as session:
yield session
- async def is_empty(self) -> bool:
- query = """
- RETURN EXISTS {
- MATCH (n)
- } AS node_exists;
- """
- query_result = await self.query(query)
- return not query_result[0]["node_exists"]
-
@deadlock_retry()
async def query(
self,
@@ -1076,7 +1067,7 @@ class Neo4jAdapter(GraphDBInterface):
query_nodes = f"""
MATCH (n)
WHERE {where_clause}
- RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties
+ RETURN n.id AS id, labels(n) AS labels, properties(n) AS properties
"""
result_nodes = await self.query(query_nodes)
@@ -1091,7 +1082,7 @@ class Neo4jAdapter(GraphDBInterface):
query_edges = f"""
MATCH (n)-[r]->(m)
WHERE {where_clause} AND {where_clause.replace("n.", "m.")}
- RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
+ RETURN n.id AS source, m.id AS target, TYPE(r) AS type, properties(r) AS properties
"""
result_edges = await self.query(query_edges)
diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py
index edd2d89b0..dcdd68cad 100644
--- a/cognee/infrastructure/files/utils/guess_file_type.py
+++ b/cognee/infrastructure/files/utils/guess_file_type.py
@@ -124,6 +124,12 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:
"""
file_type = filetype.guess(file)
+ # No magic number matched: treat the file as plain text, since text files carry no magic number. NOTE(review): this makes the `file_type is None` raise below unreachable.
+ if file_type is None:
+ from filetype.types.base import Type
+
+ file_type = Type("text/plain", "txt")
+
if file_type is None:
raise FileTypeException(f"Unknown file detected: {file.name}.")
diff --git a/cognee/modules/pipelines/operations/run_tasks_distributed.py b/cognee/modules/pipelines/operations/run_tasks_distributed.py
index 95cdb0266..3fce3763d 100644
--- a/cognee/modules/pipelines/operations/run_tasks_distributed.py
+++ b/cognee/modules/pipelines/operations/run_tasks_distributed.py
@@ -88,6 +88,7 @@ async def run_tasks_distributed(
pipeline_name: str = "unknown_pipeline",
context: dict = None,
incremental_loading: bool = False,
+ data_per_batch: int = 20,
):
if not user:
user = await get_default_user()
diff --git a/cognee/tests/test_kuzu.py b/cognee/tests/test_kuzu.py
index fe9da6dcb..8749e42d0 100644
--- a/cognee/tests/test_kuzu.py
+++ b/cognee/tests/test_kuzu.py
@@ -47,26 +47,10 @@ async def main():
pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt"
)
- from cognee.infrastructure.databases.graph import get_graph_engine
-
- graph_engine = await get_graph_engine()
-
- is_empty = await graph_engine.is_empty()
-
- assert is_empty, "Kuzu graph database is not empty"
-
await cognee.add([explanation_file_path_quantum], dataset_name)
- is_empty = await graph_engine.is_empty()
-
- assert is_empty, "Kuzu graph database should be empty before cognify"
-
await cognee.cognify([dataset_name])
- is_empty = await graph_engine.is_empty()
-
- assert not is_empty, "Kuzu graph database should not be empty"
-
from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine()
@@ -130,10 +114,11 @@ async def main():
assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
await cognee.prune.prune_system(metadata=True)
+ from cognee.infrastructure.databases.graph import get_graph_engine
- is_empty = await graph_engine.is_empty()
-
- assert is_empty, "Kuzu graph database is not empty"
+ graph_engine = await get_graph_engine()
+ nodes, edges = await graph_engine.get_graph_data()
+ assert len(nodes) == 0 and len(edges) == 0, "Kuzu graph database is not empty"
finally:
# Ensure cleanup even if tests fail
diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py
index 925614e67..c74b4ab65 100644
--- a/cognee/tests/test_neo4j.py
+++ b/cognee/tests/test_neo4j.py
@@ -35,14 +35,6 @@ async def main():
explanation_file_path_nlp = os.path.join(
pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
)
- from cognee.infrastructure.databases.graph import get_graph_engine
-
- graph_engine = await get_graph_engine()
-
- is_empty = await graph_engine.is_empty()
-
- assert is_empty, "Graph has to be empty"
-
await cognee.add([explanation_file_path_nlp], dataset_name)
explanation_file_path_quantum = os.path.join(
@@ -50,16 +42,9 @@ async def main():
)
await cognee.add([explanation_file_path_quantum], dataset_name)
- is_empty = await graph_engine.is_empty()
-
- assert is_empty, "Graph has to be empty before cognify"
await cognee.cognify([dataset_name])
- is_empty = await graph_engine.is_empty()
-
- assert not is_empty, "Graph shouldn't be empty"
-
from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine()
@@ -132,8 +117,11 @@ async def main():
assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
await cognee.prune.prune_system(metadata=True)
- is_empty = await graph_engine.is_empty()
- assert is_empty, "Neo4j graph database is not empty"
+ from cognee.infrastructure.databases.graph import get_graph_engine
+
+ graph_engine = await get_graph_engine()
+ nodes, edges = await graph_engine.get_graph_data()
+ assert len(nodes) == 0 and len(edges) == 0, "Neo4j graph database is not empty"
if __name__ == "__main__":
diff --git a/cognee/tests/unit/api/test_search.py b/cognee/tests/unit/api/test_search.py
deleted file mode 100644
index 54a4cc35f..000000000
--- a/cognee/tests/unit/api/test_search.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pytest
-import cognee
-
-
-@pytest.mark.asyncio
-async def test_empty_search_raises_SearchOnEmptyGraphError_on_empty_graph():
- await cognee.prune.prune_data()
- await cognee.prune.prune_system(metadata=True)
- await cognee.add("Sample input")
- result = await cognee.search("Sample query")
- assert result == []
-
-
-@pytest.mark.asyncio
-async def test_empty_search_doesnt_raise_SearchOnEmptyGraphError():
- await cognee.prune.prune_data()
- await cognee.prune.prune_system(metadata=True)
- await cognee.add("Sample input")
- await cognee.cognify()
- result = await cognee.search("Sample query")
- assert result != []