From ac5118ee34c4bd149ac26d042e2ffe5292ee3459 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 15 Oct 2025 17:28:51 +0200 Subject: [PATCH 01/28] test:Add load test --- cognee/tests/load_test.py | 61 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 cognee/tests/load_test.py diff --git a/cognee/tests/load_test.py b/cognee/tests/load_test.py new file mode 100644 index 000000000..da9b74ab9 --- /dev/null +++ b/cognee/tests/load_test.py @@ -0,0 +1,61 @@ +import os +import pathlib +import asyncio +import time + +import cognee +from cognee.modules.search.types import SearchType +from cognee.shared.logging_utils import get_logger + +logger = get_logger() + +async def helper_func(num_of_searches): + + start_time = time.time() + + await cognee.cognify() + + await asyncio.gather( + *[ + cognee.search(query_text="Tell me about AI", query_type=SearchType.GRAPH_COMPLETION) + for _ in range(num_of_searches) + ] + ) + + end_time = time.time() + + return end_time - start_time + +async def main(): + + file_path = os.path.join( + pathlib.Path(__file__).resolve().parent, "test_data/artificial-intelligence.pdf" + ) + + num_of_pdfs = 10 + num_of_reps = 5 + upper_boundary_minutes = 3 + average_minutes = 1.5 + + await asyncio.gather( + *[ + cognee.add(file_path, dataset_name=f"dataset_{i}") + for i in range(num_of_pdfs) + ] + ) + + recorded_times = await asyncio.gather( + *[helper_func(num_of_pdfs) for _ in range(num_of_reps)] + ) + + average_recorded_time = sum(recorded_times) / len(recorded_times) + + assert average_recorded_time <= average_minutes * 60 + + assert all(rec_time <= upper_boundary_minutes * 60 for rec_time in recorded_times) + + return + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From c16459d236a6e07b9267d323387b2be217fd5b46 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 15 Oct 2025 17:58:05 +0200 Subject: [PATCH 02/28] test: Add prune step to the test --- cognee/tests/load_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cognee/tests/load_test.py b/cognee/tests/load_test.py index da9b74ab9..c44efad00 100644 --- a/cognee/tests/load_test.py +++ b/cognee/tests/load_test.py @@ -9,7 +9,7 @@ from cognee.shared.logging_utils import get_logger logger = get_logger() -async def helper_func(num_of_searches): +async def process_and_search(num_of_searches): start_time = time.time() @@ -37,6 +37,9 @@ async def main(): upper_boundary_minutes = 3 average_minutes = 1.5 + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await asyncio.gather( *[ cognee.add(file_path, dataset_name=f"dataset_{i}") @@ -45,7 +48,7 @@ async def main(): ) recorded_times = await asyncio.gather( - *[helper_func(num_of_pdfs) for _ in range(num_of_reps)] + *[process_and_search(num_of_pdfs) for _ in range(num_of_reps)] ) average_recorded_time = sum(recorded_times) / len(recorded_times) @@ -54,8 +57,6 @@ async def main(): assert all(rec_time <= upper_boundary_minutes * 60 for rec_time in recorded_times) - return - if __name__ == "__main__": asyncio.run(main()) \ No newline at end of file From c5648e63375d9eb1520f5a007dda520f70c9c145 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 22 Oct 2025 09:22:11 +0200 Subject: [PATCH 03/28] test: Add load test. 
--- .github/workflows/e2e_tests.yml | 31 ++++++++++++++++++++- cognee/tests/{load_test.py => test_load.py} | 30 ++++++++++++-------- 2 files changed, 49 insertions(+), 12 deletions(-) rename cognee/tests/{load_test.py => test_load.py} (65%) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 9582a3f3b..5f66e71d2 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -330,4 +330,33 @@ jobs: DB_PORT: 5432 DB_USERNAME: cognee DB_PASSWORD: cognee - run: uv run python ./cognee/tests/test_concurrent_subprocess_access.py \ No newline at end of file + run: uv run python ./cognee/tests/test_concurrent_subprocess_access.py + + test-load: + name: Test Load + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Dependencies already installed + run: echo "Dependencies already installed in setup" + + - name: Run Load Test + env: + ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: True + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/test_load.py \ No newline at end of file diff --git a/cognee/tests/load_test.py b/cognee/tests/test_load.py similarity index 65% rename from cognee/tests/load_test.py rename to cognee/tests/test_load.py index c44efad00..09e2db084 100644 --- a/cognee/tests/load_test.py +++ b/cognee/tests/test_load.py @@ -9,8 +9,8 @@ from cognee.shared.logging_utils import get_logger logger = get_logger() -async def process_and_search(num_of_searches): +async def process_and_search(num_of_searches): start_time = time.time() await cognee.cognify() @@ -26,26 +26,34 @@ async def process_and_search(num_of_searches): return end_time - start_time -async def main(): +async def main(): file_path = os.path.join( pathlib.Path(__file__).resolve().parent, "test_data/artificial-intelligence.pdf" ) + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") + ).resolve() + ) + cognee.config.data_root_directory(data_directory_path) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_load") + ).resolve() + ) + cognee.config.system_root_directory(cognee_directory_path) num_of_pdfs = 10 num_of_reps = 5 - upper_boundary_minutes = 3 - average_minutes = 1.5 + upper_boundary_minutes = 10 + average_minutes = 8 await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await asyncio.gather( - *[ - cognee.add(file_path, dataset_name=f"dataset_{i}") - for i in range(num_of_pdfs) - ] - ) + for i in range(num_of_pdfs): + await cognee.add(file_path, dataset_name=f"dataset_{i}") recorded_times = await asyncio.gather( *[process_and_search(num_of_pdfs) for _ in range(num_of_reps)] @@ -59,4 +67,4 @@ async def main(): if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 897fbd2f09abfc1c3c5cc30fc2fcf17ed549ae80 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 27 Oct 2025 15:42:09 +0100 Subject: [PATCH 04/28] load test now uses s3 bucket --- 
cognee/tests/test_load.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cognee/tests/test_load.py b/cognee/tests/test_load.py index 09e2db084..f8d007d28 100644 --- a/cognee/tests/test_load.py +++ b/cognee/tests/test_load.py @@ -17,7 +17,9 @@ async def process_and_search(num_of_searches): await asyncio.gather( *[ - cognee.search(query_text="Tell me about AI", query_type=SearchType.GRAPH_COMPLETION) + cognee.search( + query_text="Tell me about the document", query_type=SearchType.GRAPH_COMPLETION + ) for _ in range(num_of_searches) ] ) @@ -28,9 +30,6 @@ async def process_and_search(num_of_searches): async def main(): - file_path = os.path.join( - pathlib.Path(__file__).resolve().parent, "test_data/artificial-intelligence.pdf" - ) data_directory_path = str( pathlib.Path( os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") @@ -52,8 +51,8 @@ async def main(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - for i in range(num_of_pdfs): - await cognee.add(file_path, dataset_name=f"dataset_{i}") + s3_input = "s3://cognee-load-test-s3-bucket" + await cognee.add(s3_input) recorded_times = await asyncio.gather( *[process_and_search(num_of_pdfs) for _ in range(num_of_reps)] From 2b083dd0f110e44341d30a6228abb18591cfabac Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 28 Oct 2025 09:27:33 +0100 Subject: [PATCH 05/28] small changes to load test --- cognee/tests/test_load.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cognee/tests/test_load.py b/cognee/tests/test_load.py index f8d007d28..a09ce053d 100644 --- a/cognee/tests/test_load.py +++ b/cognee/tests/test_load.py @@ -48,15 +48,15 @@ async def main(): upper_boundary_minutes = 10 average_minutes = 8 - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) + recorded_times = [] + for _ in range(num_of_reps): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) - s3_input = "s3://cognee-load-test-s3-bucket" - await cognee.add(s3_input) + s3_input = "s3://cognee-test-load-s3-bucket" + await cognee.add(s3_input) - recorded_times = await asyncio.gather( - *[process_and_search(num_of_pdfs) for _ in range(num_of_reps)] - ) + recorded_times.append(await process_and_search(num_of_pdfs)) average_recorded_time = sum(recorded_times) / len(recorded_times) From fb7e74eaa8d8bdae50021e0a24e7e6bf6bfc2a09 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 16:28:09 +0100 Subject: [PATCH 06/28] refactor: Enable multi user mode by default if graph and vector db providers support it --- .env.template | 5 ++- cognee/context_global_variables.py | 43 ++++++++++++++++++- cognee/modules/search/methods/search.py | 6 +-- .../users/methods/get_authenticated_user.py | 3 +- .../relational_database_migration_example.py | 3 ++ logs/.gitkeep | 0 logs/README.md | 31 ------------- 7 files changed, 53 insertions(+), 38 deletions(-) delete mode 100644 logs/.gitkeep delete mode 100644 logs/README.md diff --git a/.env.template b/.env.template index 89ac06830..8e1bdd23f 100644 --- a/.env.template +++ b/.env.template @@ -169,8 +169,9 @@ REQUIRE_AUTHENTICATION=False # Vector: LanceDB # Graph: KuzuDB # -# It enforces LanceDB and KuzuDB use and uses them to create databases per Cognee user + dataset -ENABLE_BACKEND_ACCESS_CONTROL=False +# It enforces creation of databases per Cognee user + dataset. Does not work with some graph and database providers. 
+# Disable this mode when using unsupported graph/vector databases.
+ENABLE_BACKEND_ACCESS_CONTROL=True
 
 ################################################################################
 # ☁️ Cloud Sync Settings

diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py
index d52de4b4e..8ad855724 100644
--- a/cognee/context_global_variables.py
+++ b/cognee/context_global_variables.py
@@ -4,6 +4,8 @@
 from typing import Union
 from uuid import UUID
 from cognee.base_config import get_base_config
+from cognee.infrastructure.databases.vector.config import get_vectordb_context_config
+from cognee.infrastructure.databases.graph.config import get_graph_context_config
 from cognee.infrastructure.databases.utils import get_or_create_dataset_database
 from cognee.infrastructure.files.storage.config import file_storage_config
 from cognee.modules.users.methods import get_user
@@ -14,11 +16,50 @@ vector_db_config = ContextVar("vector_db_config", default=None)
 graph_db_config = ContextVar("graph_db_config", default=None)
 session_user = ContextVar("session_user", default=None)
 
+vector_dbs_with_multi_user_support = ["lancedb"]
+graph_dbs_with_multi_user_support = ["kuzu"]
+
 
 async def set_session_user_context_variable(user):
     session_user.set(user)
 
 
+def check_multi_user_support():
+    graph_db_config = get_graph_context_config()
+    vector_db_config = get_vectordb_context_config()
+    if (
+        graph_db_config["graph_database_provider"] in graph_dbs_with_multi_user_support
+        and vector_db_config["vector_db_provider"] in vector_dbs_with_multi_user_support
+    ):
+        return True
+    else:
+        return False
+
+
+def check_backend_access_control_mode():
+    backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None)
+    if backend_access_control is None:
+        # If backend access control is not defined in environment variables,
+        # enable it by default if graph and vector DBs can support it, otherwise disable it
+        multi_user_support = check_multi_user_support()
+        if multi_user_support:
+            return "true"
+        else:
+            return "false"
+    elif backend_access_control.lower() == "true":
+        # If enabled, ensure that the current graph and vector DBs can support it
+        multi_user_support = check_multi_user_support()
+        if not multi_user_support:
+            raise EnvironmentError(
+                "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control."
+ ) + else: + return "true" + else: + # If explicitly disabled, return false + return "false" + + async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID): """ If backend access control is enabled this function will ensure all datasets have their own databases, @@ -40,7 +81,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config = get_base_config() - if not os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": + if not check_backend_access_control_mode() == "true": return user = await get_user(user_id) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index aab004924..e3d7c220e 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -1,4 +1,3 @@ -import os import json import asyncio from uuid import UUID @@ -9,6 +8,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine from cognee.shared.logging_utils import get_logger from cognee.shared.utils import send_telemetry from cognee.context_global_variables import set_database_global_context_variables +from cognee.context_global_variables import check_backend_access_control_mode from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge @@ -74,7 +74,7 @@ async def search( ) # Use search function filtered by permissions if access control is enabled - if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": + if check_backend_access_control_mode() == "true": search_results = await authorized_search( query_type=query_type, query_text=query_text, @@ -156,7 +156,7 @@ async def search( ) else: # This is for maintaining backwards compatibility - if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": + if check_backend_access_control_mode() == "true": return_value = [] for search_result in search_results: prepared_search_results = await prepare_search_result(search_result) diff --git a/cognee/modules/users/methods/get_authenticated_user.py b/cognee/modules/users/methods/get_authenticated_user.py index d78215892..34d82586e 100644 --- a/cognee/modules/users/methods/get_authenticated_user.py +++ b/cognee/modules/users/methods/get_authenticated_user.py @@ -5,6 +5,7 @@ from ..models import User from ..get_fastapi_users import get_fastapi_users from .get_default_user import get_default_user from cognee.shared.logging_utils import get_logger +from cognee.context_global_variables import check_backend_access_control_mode logger = get_logger("get_authenticated_user") @@ -12,7 +13,7 @@ logger = get_logger("get_authenticated_user") # Check environment variable to determine authentication requirement REQUIRE_AUTHENTICATION = ( os.getenv("REQUIRE_AUTHENTICATION", "false").lower() == "true" - or os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true" + or check_backend_access_control_mode() == "true" ) fastapi_users = get_fastapi_users() diff --git a/examples/python/relational_database_migration_example.py b/examples/python/relational_database_migration_example.py index 7e87347bc..98482cb4b 100644 --- a/examples/python/relational_database_migration_example.py +++ b/examples/python/relational_database_migration_example.py @@ -31,6 +31,9 @@ from cognee.infrastructure.databases.vector.pgvector import ( async def main(): + # Disable backend access control to migrate relational data + os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" + # Clean all data 
stored in Cognee await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) diff --git a/logs/.gitkeep b/logs/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/logs/README.md b/logs/README.md deleted file mode 100644 index 96ef613b5..000000000 --- a/logs/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Logs Directory - -This directory contains the application logs for Cognee. - -## Log Files - -- Log files are named by date in the format `YYYY-MM-DD_HH-MM-SS.log` -- Logs are stored in plain text format with a consistent structure -- Each log entry includes: - - Timestamp (ISO format) - - Log level (padded to consistent width) - - Message - - Additional context (if any) - - Logger name (in square brackets) -- Exception tracebacks are included for error logs - -## Sample Log Entry - -``` -2025-03-27T13:05:27.481446Z [INFO ] Structured log message user_id=user123 action=login status=success [TestLogger] -``` - -## Retention Policy - -The system automatically keeps only the 10 most recent log files. Older log files are automatically deleted when new log files are created. This prevents excessive disk usage in long-running deployments. - -## Usage - -Logs are automatically generated by the application's logging mechanism. No manual actions are required to use this feature. - -The logs directory structure is preserved in version control, but the log files themselves are gitignored. From 6a7660a7c10892657422307b90788d0d6f80b8ab Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 16:31:42 +0100 Subject: [PATCH 07/28] refactor: Return logs folder --- logs/.gitkeep | 0 logs/README.md | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 logs/.gitkeep create mode 100644 logs/README.md diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/logs/README.md b/logs/README.md new file mode 100644 index 000000000..96ef613b5 --- /dev/null +++ b/logs/README.md @@ -0,0 +1,31 @@ +# Logs Directory + +This directory contains the application logs for Cognee. + +## Log Files + +- Log files are named by date in the format `YYYY-MM-DD_HH-MM-SS.log` +- Logs are stored in plain text format with a consistent structure +- Each log entry includes: + - Timestamp (ISO format) + - Log level (padded to consistent width) + - Message + - Additional context (if any) + - Logger name (in square brackets) +- Exception tracebacks are included for error logs + +## Sample Log Entry + +``` +2025-03-27T13:05:27.481446Z [INFO ] Structured log message user_id=user123 action=login status=success [TestLogger] +``` + +## Retention Policy + +The system automatically keeps only the 10 most recent log files. Older log files are automatically deleted when new log files are created. This prevents excessive disk usage in long-running deployments. + +## Usage + +Logs are automatically generated by the application's logging mechanism. No manual actions are required to use this feature. + +The logs directory structure is preserved in version control, but the log files themselves are gitignored. 
From 6572cf5cb9bcd7bc3906dc9149a22759069e79b2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 16:35:44 +0100 Subject: [PATCH 08/28] refactor: use boolean instead of string --- cognee/context_global_variables.py | 10 +++++----- cognee/modules/search/methods/search.py | 4 ++-- cognee/modules/users/methods/get_authenticated_user.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 8ad855724..b4b848192 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -43,9 +43,9 @@ def check_backend_access_control_mode(): # enable it by default if graph and vector DBs can support it, otherwise disable it multi_user_support = check_multi_user_support() if multi_user_support: - return "true" + return True else: - return "false" + return False elif backend_access_control.lower() == "true": # If enabled, ensure that the current graph and vector DBs can support it multi_user_support = check_multi_user_support() @@ -54,10 +54,10 @@ def check_backend_access_control_mode(): "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control." ) else: - return "true" + return True else: # If explicitly disabled, return false - return "false" + return False async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID): @@ -81,7 +81,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config = get_base_config() - if not check_backend_access_control_mode() == "true": + if not check_backend_access_control_mode(): return user = await get_user(user_id) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index e3d7c220e..4a67093e8 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -74,7 +74,7 @@ async def search( ) # Use search function filtered by permissions if access control is enabled - if check_backend_access_control_mode() == "true": + if check_backend_access_control_mode(): search_results = await authorized_search( query_type=query_type, query_text=query_text, @@ -156,7 +156,7 @@ async def search( ) else: # This is for maintaining backwards compatibility - if check_backend_access_control_mode() == "true": + if check_backend_access_control_mode(): return_value = [] for search_result in search_results: prepared_search_results = await prepare_search_result(search_result) diff --git a/cognee/modules/users/methods/get_authenticated_user.py b/cognee/modules/users/methods/get_authenticated_user.py index 34d82586e..3cc16f3a8 100644 --- a/cognee/modules/users/methods/get_authenticated_user.py +++ b/cognee/modules/users/methods/get_authenticated_user.py @@ -13,7 +13,7 @@ logger = get_logger("get_authenticated_user") # Check environment variable to determine authentication requirement REQUIRE_AUTHENTICATION = ( os.getenv("REQUIRE_AUTHENTICATION", "false").lower() == "true" - or check_backend_access_control_mode() == "true" + or check_backend_access_control_mode() ) fastapi_users = get_fastapi_users() From d1581e9ebab143930acf1cec404dda083fb06a9f Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 17:36:56 +0100 Subject: [PATCH 09/28] refactor: disable permissions for code graph example --- examples/python/code_graph_example.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 431069050..1b476a2c3 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,5 +1,7 @@ import argparse import asyncio +import os + import cognee from cognee import SearchType from cognee.shared.logging_utils import setup_logging, ERROR @@ -8,6 +10,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline async def main(repo_path, include_docs): + # Disable permissions feature for this example + os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" + run_status = False async for run_status in run_code_graph_pipeline(repo_path, include_docs=include_docs): run_status = run_status From eec96e4f1fb30e692b6e448aa9b1e553c0fead98 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 19:14:53 +0100 Subject: [PATCH 10/28] refactor: fix search result for library test --- cognee/tests/test_library.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 81f81ee61..893b836c0 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -90,15 +90,17 @@ async def main(): ) search_results = await cognee.search( - query_type=SearchType.GRAPH_COMPLETION, query_text="What information do you contain?" + query_type=SearchType.GRAPH_COMPLETION, + query_text="What information do you contain?", + dataset_ids=[pipeline_run_obj.dataset_id], ) - assert "Mark" in search_results[0], ( + assert "Mark" in search_results[0]["search_result"][0], ( "Failed to update document, no mention of Mark in search results" ) - assert "Cindy" in search_results[0], ( + assert "Cindy" in search_results[0]["search_result"][0], ( "Failed to update document, no mention of Cindy in search results" ) - assert "Artificial intelligence" not in search_results[0], ( + assert "Artificial intelligence" not in search_results[0]["search_result"][0], ( "Failed to update document, Artificial intelligence still mentioned in search results" ) From ee0ecd52d8115396523c1a62b2c99b3178f5d182 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 16:25:34 +0100 Subject: [PATCH 11/28] refactor: Rewrite tests to work with multi-user mode by default --- .../users/test_conditional_authentication.py | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/cognee/tests/unit/modules/users/test_conditional_authentication.py b/cognee/tests/unit/modules/users/test_conditional_authentication.py index c4368d796..6568c3cb0 100644 --- a/cognee/tests/unit/modules/users/test_conditional_authentication.py +++ b/cognee/tests/unit/modules/users/test_conditional_authentication.py @@ -107,29 +107,10 @@ class TestConditionalAuthenticationIntegration: # REQUIRE_AUTHENTICATION should be a boolean assert isinstance(REQUIRE_AUTHENTICATION, bool) - # Currently should be False (optional authentication) - assert not REQUIRE_AUTHENTICATION - class TestConditionalAuthenticationEnvironmentVariables: """Test environment variable handling.""" - def test_require_authentication_default_false(self): - """Test that REQUIRE_AUTHENTICATION defaults to false when imported with no env vars.""" - with patch.dict(os.environ, {}, clear=True): - # Remove module from cache to force fresh import - module_name = "cognee.modules.users.methods.get_authenticated_user" - if module_name in sys.modules: - del sys.modules[module_name] - - # Import after patching environment - module will see empty environment - from 
cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - importlib.invalidate_caches() - assert not REQUIRE_AUTHENTICATION - def test_require_authentication_true(self): """Test that REQUIRE_AUTHENTICATION=true is parsed correctly when imported.""" with patch.dict(os.environ, {"REQUIRE_AUTHENTICATION": "true"}): @@ -145,50 +126,6 @@ class TestConditionalAuthenticationEnvironmentVariables: assert REQUIRE_AUTHENTICATION - def test_require_authentication_false_explicit(self): - """Test that REQUIRE_AUTHENTICATION=false is parsed correctly when imported.""" - with patch.dict(os.environ, {"REQUIRE_AUTHENTICATION": "false"}): - # Remove module from cache to force fresh import - module_name = "cognee.modules.users.methods.get_authenticated_user" - if module_name in sys.modules: - del sys.modules[module_name] - - # Import after patching environment - module will see REQUIRE_AUTHENTICATION=false - from cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - assert not REQUIRE_AUTHENTICATION - - def test_require_authentication_case_insensitive(self): - """Test that environment variable parsing is case insensitive when imported.""" - test_cases = ["TRUE", "True", "tRuE", "FALSE", "False", "fAlSe"] - - for case in test_cases: - with patch.dict(os.environ, {"REQUIRE_AUTHENTICATION": case}): - # Remove module from cache to force fresh import - module_name = "cognee.modules.users.methods.get_authenticated_user" - if module_name in sys.modules: - del sys.modules[module_name] - - # Import after patching environment - from cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - expected = case.lower() == "true" - assert REQUIRE_AUTHENTICATION == expected, f"Failed for case: {case}" - - def test_current_require_authentication_value(self): - """Test that the current REQUIRE_AUTHENTICATION module value is as expected.""" - from cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - # The module-level variable should currently be False (set at import time) - assert isinstance(REQUIRE_AUTHENTICATION, bool) - assert not REQUIRE_AUTHENTICATION - class TestConditionalAuthenticationEdgeCases: """Test edge cases and error scenarios.""" From 9d8430cfb08e17467390ff53e57d330885c6c7d9 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 16:52:04 +0100 Subject: [PATCH 12/28] refactor: Update unit tests for require auth --- .../test_conditional_authentication_endpoints.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cognee/tests/unit/api/test_conditional_authentication_endpoints.py b/cognee/tests/unit/api/test_conditional_authentication_endpoints.py index 2eabee91a..6cc37ef38 100644 --- a/cognee/tests/unit/api/test_conditional_authentication_endpoints.py +++ b/cognee/tests/unit/api/test_conditional_authentication_endpoints.py @@ -1,3 +1,4 @@ +import os import pytest from unittest.mock import patch, AsyncMock, MagicMock from uuid import uuid4 @@ -5,8 +6,6 @@ from fastapi.testclient import TestClient from types import SimpleNamespace import importlib -from cognee.api.client import app - # Fixtures for reuse across test classes @pytest.fixture @@ -32,6 +31,10 @@ def mock_authenticated_user(): ) +# To turn off authentication we need to set the environment variable before importing the module +# Also both require_authentication and backend access control must be false +os.environ["REQUIRE_AUTHENTICATION"] = "false" 
+os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" gau_mod = importlib.import_module("cognee.modules.users.methods.get_authenticated_user") @@ -40,6 +43,8 @@ class TestConditionalAuthenticationEndpoints: @pytest.fixture def client(self): + from cognee.api.client import app + """Create a test client.""" return TestClient(app) @@ -133,6 +138,8 @@ class TestConditionalAuthenticationBehavior: @pytest.fixture def client(self): + from cognee.api.client import app + return TestClient(app) @pytest.mark.parametrize( @@ -209,6 +216,8 @@ class TestConditionalAuthenticationErrorHandling: @pytest.fixture def client(self): + from cognee.api.client import app + return TestClient(app) @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) @@ -232,7 +241,7 @@ class TestConditionalAuthenticationErrorHandling: # The exact error message may vary depending on the actual database connection # The important thing is that we get a 500 error when user creation fails - def test_current_environment_configuration(self): + def test_current_environment_configuration(self, client): """Test that current environment configuration is working properly.""" # This tests the actual module state without trying to change it from cognee.modules.users.methods.get_authenticated_user import ( From e061f34a28bf9cd27453ea6aac8abf215c7bde8f Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 17:13:10 +0100 Subject: [PATCH 13/28] fix: Resolve issue with dataset names for example --- examples/python/feedback_enrichment_minimal_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/python/feedback_enrichment_minimal_example.py b/examples/python/feedback_enrichment_minimal_example.py index 11ef20830..8954bd5f6 100644 --- a/examples/python/feedback_enrichment_minimal_example.py +++ b/examples/python/feedback_enrichment_minimal_example.py @@ -67,7 +67,6 @@ async def run_feedback_enrichment_memify(last_n: int = 5): extraction_tasks=extraction_tasks, enrichment_tasks=enrichment_tasks, data=[{}], # A placeholder to prevent fetching the entire graph - dataset="feedback_enrichment_minimal", ) From 45bb3130c695260af9ccb00e756a0b4d22f0a85b Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 17:40:00 +0100 Subject: [PATCH 14/28] fix: Use same dataset name accross cognee calls --- cognee/tests/test_feedback_enrichment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/test_feedback_enrichment.py b/cognee/tests/test_feedback_enrichment.py index 02d90db32..378cb0e45 100644 --- a/cognee/tests/test_feedback_enrichment.py +++ b/cognee/tests/test_feedback_enrichment.py @@ -133,7 +133,7 @@ async def main(): extraction_tasks=extraction_tasks, enrichment_tasks=enrichment_tasks, data=[{}], - dataset="feedback_enrichment_test_memify", + dataset=dataset_name, ) nodes_after, edges_after = await graph_engine.get_graph_data() From 1b483276b0077c57eefdc7e7aa93d58501cf7840 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 18:04:27 +0100 Subject: [PATCH 15/28] fix: disable backend access control for rel db test --- cognee/tests/test_relational_db_migration.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cognee/tests/test_relational_db_migration.py b/cognee/tests/test_relational_db_migration.py index 2b69ce854..4557e9e2f 100644 --- a/cognee/tests/test_relational_db_migration.py +++ b/cognee/tests/test_relational_db_migration.py @@ -27,6 +27,9 @@ def normalize_node_name(node_name: str) -> str: async def setup_test_db(): + # Disable backend access control to migrate 
relational data + os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" + await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) From 3c09433adead92f8a09093069a6d88de63c28409 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 13:57:12 +0100 Subject: [PATCH 16/28] fix: Resolve docling test --- cognee/tests/test_add_docling_document.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/tests/test_add_docling_document.py b/cognee/tests/test_add_docling_document.py index 2c82af66f..c5aa4e9d1 100644 --- a/cognee/tests/test_add_docling_document.py +++ b/cognee/tests/test_add_docling_document.py @@ -39,12 +39,12 @@ async def main(): answer = await cognee.search("Do programmers change light bulbs?") assert len(answer) != 0 - lowercase_answer = answer[0].lower() + lowercase_answer = answer[0]["search_result"][0].lower() assert ("no" in lowercase_answer) or ("none" in lowercase_answer) answer = await cognee.search("What colours are there in the presentation table?") assert len(answer) != 0 - lowercase_answer = answer[0].lower() + lowercase_answer = answer[0]["search_result"][0].lower() assert ( ("red" in lowercase_answer) and ("blue" in lowercase_answer) From 00a1fe71d76ae62deac5832751366061f21bc96f Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 14:33:07 +0100 Subject: [PATCH 17/28] fix: Use multi-user mode search --- examples/python/agentic_reasoning_procurement_example.py | 2 +- examples/python/memify_coding_agent_example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/python/agentic_reasoning_procurement_example.py b/examples/python/agentic_reasoning_procurement_example.py index 5aa3caa70..4e9d2d7e4 100644 --- a/examples/python/agentic_reasoning_procurement_example.py +++ b/examples/python/agentic_reasoning_procurement_example.py @@ -168,7 +168,7 @@ async def run_procurement_example(): for q in questions: print(f"Question: \n{q}") results = await procurement_system.search_memory(q, search_categories=[category]) - top_answer = results[category][0] + top_answer = results[category][0]["search_result"][0] print(f"Answer: \n{top_answer.strip()}\n") research_notes[category].append({"question": q, "answer": top_answer}) diff --git a/examples/python/memify_coding_agent_example.py b/examples/python/memify_coding_agent_example.py index 1fd3b1528..4a087ba61 100644 --- a/examples/python/memify_coding_agent_example.py +++ b/examples/python/memify_coding_agent_example.py @@ -89,7 +89,7 @@ async def main(): ) print("Coding rules created by memify:") - for coding_rule in coding_rules: + for coding_rule in coding_rules[0]["search_result"][0]: print("- " + coding_rule) # Visualize new graph with added memify context From 4c8b8211979fc12b6e46a911eba1c2e2610187d8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 14:55:52 +0100 Subject: [PATCH 18/28] fix: resolve test failing --- cognee/tests/test_search_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/test_search_db.py b/cognee/tests/test_search_db.py index e24abd0f5..bcc4529a9 100644 --- a/cognee/tests/test_search_db.py +++ b/cognee/tests/test_search_db.py @@ -146,7 +146,7 @@ async def main(): assert len(search_results) == 1, ( f"{name}: expected single-element list, got {len(search_results)}" ) - text = search_results[0] + text = search_results[0]["search_result"][0] assert isinstance(text, str), f"{name}: element should be a string" assert text.strip(), f"{name}: string should not be empty" assert 
"netherlands" in text.lower(), ( From f368a1a4d5d79a4485bc52ff5d9f16fc909942d2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 20:10:05 +0100 Subject: [PATCH 19/28] fix: set tests to not use multi-user mode --- .github/workflows/search_db_tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/search_db_tests.yml b/.github/workflows/search_db_tests.yml index e3e46dd97..118c1c06c 100644 --- a/.github/workflows/search_db_tests.yml +++ b/.github/workflows/search_db_tests.yml @@ -84,6 +84,7 @@ jobs: GRAPH_DATABASE_PROVIDER: 'neo4j' VECTOR_DB_PROVIDER: 'lancedb' DB_PROVIDER: 'sqlite' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }} GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }} GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }} @@ -135,6 +136,7 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPH_DATABASE_PROVIDER: 'kuzu' VECTOR_DB_PROVIDER: 'pgvector' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' DB_PROVIDER: 'postgres' DB_NAME: 'cognee_db' DB_HOST: '127.0.0.1' @@ -197,6 +199,7 @@ jobs: GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }} GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }} GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }} + ENABLE_BACKEND_ACCESS_CONTROL: 'false' DB_NAME: cognee_db DB_HOST: 127.0.0.1 DB_PORT: 5432 From 2ab2cffd07ed35a268362af48c2fb668674509f1 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 3 Nov 2025 16:37:03 +0100 Subject: [PATCH 20/28] chore: update test_search_db to work with all graph providers --- cognee/tests/test_search_db.py | 8 +++++++- examples/python/simple_example.py | 8 -------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/cognee/tests/test_search_db.py b/cognee/tests/test_search_db.py index bcc4529a9..ea3f0ea44 100644 --- a/cognee/tests/test_search_db.py +++ b/cognee/tests/test_search_db.py @@ -146,7 +146,13 @@ async def main(): assert len(search_results) == 1, ( f"{name}: expected single-element list, got {len(search_results)}" ) - text = search_results[0]["search_result"][0] + + from cognee.context_global_variables import check_backend_access_control_mode + + if check_backend_access_control_mode(): + text = search_results[0]["search_result"][0] + else: + text = search_results[0] assert isinstance(text, str), f"{name}: element should be a string" assert text.strip(), f"{name}: string should not be empty" assert "netherlands" in text.lower(), ( diff --git a/examples/python/simple_example.py b/examples/python/simple_example.py index c13e48f85..237a8295e 100644 --- a/examples/python/simple_example.py +++ b/examples/python/simple_example.py @@ -59,14 +59,6 @@ async def main(): for result_text in search_results: print(result_text) - # Example output: - # ({'id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'natural language processing', 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_a_subfield_of', 'source_node_id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'target_node_id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 15, 473137, tzinfo=datetime.timezone.utc)}, {'id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 
'computer science', 'description': 'The study of computation and information processing.'}) - # (...) - # It represents nodes and relationships in the knowledge graph: - # - The first element is the source node (e.g., 'natural language processing'). - # - The second element is the relationship between nodes (e.g., 'is_a_subfield_of'). - # - The third element is the target node (e.g., 'computer science'). - if __name__ == "__main__": logger = setup_logging(log_level=ERROR) From 4424bdc76471342119d59943c38d392dac9d72b0 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 17:06:51 +0100 Subject: [PATCH 21/28] test: fix path based on pr comment --- cognee/tests/test_load.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/cognee/tests/test_load.py b/cognee/tests/test_load.py index a09ce053d..b38466bc7 100644 --- a/cognee/tests/test_load.py +++ b/cognee/tests/test_load.py @@ -30,17 +30,10 @@ async def process_and_search(num_of_searches): async def main(): - data_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") - ).resolve() - ) + data_directory_path = os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") cognee.config.data_root_directory(data_directory_path) - cognee_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_load") - ).resolve() - ) + + cognee_directory_path = os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_load") cognee.config.system_root_directory(cognee_directory_path) num_of_pdfs = 10 From eb8df45dab2cb7a07de7eb97570d364790f80080 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 18:10:19 +0100 Subject: [PATCH 22/28] test: increase file descriptor limit on workflow load test --- .github/workflows/e2e_tests.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index cf704c76a..79df3ff6b 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -463,6 +463,12 @@ jobs: with: python-version: '3.11.x' + - name: Set File Descriptor Limit + run: sudo prlimit --pid $$ --nofile=4096:4096 + + - name: Verify File Descriptor Limit + run: ulimit -n + - name: Dependencies already installed run: echo "Dependencies already installed in setup" @@ -478,4 +484,9 @@ jobs: EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + STORAGE_BACKEND: s3 + AWS_REGION: eu-west-1 + AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} run: uv run python ./cognee/tests/test_load.py \ No newline at end of file From a7d63df98c7ad21a40679c1b821acb30690c65d8 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 18:15:18 +0100 Subject: [PATCH 23/28] test: add extra aws dependency to load test --- .github/workflows/e2e_tests.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 79df3ff6b..0596f22d3 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -267,8 +267,6 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_edge_ingestion.py - - run_concurrent_subprocess_access_test: 
name: Concurrent Subprocess access test runs-on: ubuntu-latest @@ -450,7 +448,6 @@ jobs: DB_PASSWORD: cognee run: uv run python ./cognee/tests/test_conversation_history.py - test-load: name: Test Load runs-on: ubuntu-22.04 @@ -462,6 +459,7 @@ jobs: uses: ./.github/actions/cognee_setup with: python-version: '3.11.x' + extra-dependencies: "aws" - name: Set File Descriptor Limit run: sudo prlimit --pid $$ --nofile=4096:4096 From c81d06d364d3b1f114fb2e7e81db856e7389386d Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Mon, 3 Nov 2025 19:37:52 +0100 Subject: [PATCH 24/28] Update cognee/context_global_variables.py Co-authored-by: Pavel Zorin --- cognee/context_global_variables.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index b4b848192..6f3965441 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -41,11 +41,7 @@ def check_backend_access_control_mode(): if backend_access_control is None: # If backend access control is not defined in environment variables, # enable it by default if graph and vector DBs can support it, otherwise disable it - multi_user_support = check_multi_user_support() - if multi_user_support: - return True - else: - return False + return check_multi_user_support() elif backend_access_control.lower() == "true": # If enabled, ensure that the current graph and vector DBs can support it multi_user_support = check_multi_user_support() From 53521c2068319d340c3f2b396dbbc7f3f9c80523 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Mon, 3 Nov 2025 19:42:51 +0100 Subject: [PATCH 25/28] Update cognee/context_global_variables.py Co-authored-by: Pavel Zorin --- cognee/context_global_variables.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 6f3965441..3afbf6ff2 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -27,13 +27,10 @@ async def set_session_user_context_variable(user): def check_multi_user_support(): graph_db_config = get_graph_context_config() vector_db_config = get_vectordb_context_config() - if ( + return ( graph_db_config["graph_database_provider"] in graph_dbs_with_multi_user_support and vector_db_config["vector_db_provider"] in vector_dbs_with_multi_user_support - ): - return True - else: - return False + ) def check_backend_access_control_mode(): From 46c509778f89d5bebbcbe5f7578159e45841ca2d Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 12:06:16 +0100 Subject: [PATCH 26/28] refactor: Rename access control functions --- cognee/context_global_variables.py | 17 +++++++---------- cognee/modules/search/methods/search.py | 6 +++--- .../users/methods/get_authenticated_user.py | 4 ++-- cognee/tests/test_search_db.py | 4 ++-- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 3afbf6ff2..f17c9187a 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -24,7 +24,7 @@ async def set_session_user_context_variable(user): session_user.set(user) -def check_multi_user_support(): +def multi_user_support_possible(): graph_db_config = get_graph_context_config() vector_db_config = get_vectordb_context_config() return ( @@ -33,24 +33,21 @@ def check_multi_user_support(): ) -def 
check_backend_access_control_mode(): +def backend_access_control_enabled(): backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None) if backend_access_control is None: # If backend access control is not defined in environment variables, # enable it by default if graph and vector DBs can support it, otherwise disable it - return check_multi_user_support() + return multi_user_support_possible() elif backend_access_control.lower() == "true": # If enabled, ensure that the current graph and vector DBs can support it - multi_user_support = check_multi_user_support() + multi_user_support = multi_user_support_possible() if not multi_user_support: raise EnvironmentError( "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control." ) - else: - return True - else: - # If explicitly disabled, return false - return False + return True + return False async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID): @@ -74,7 +71,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config = get_base_config() - if not check_backend_access_control_mode(): + if not backend_access_control_enabled(): return user = await get_user(user_id) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 4a67093e8..5e465b239 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -8,7 +8,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine from cognee.shared.logging_utils import get_logger from cognee.shared.utils import send_telemetry from cognee.context_global_variables import set_database_global_context_variables -from cognee.context_global_variables import check_backend_access_control_mode +from cognee.context_global_variables import backend_access_control_enabled from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge @@ -74,7 +74,7 @@ async def search( ) # Use search function filtered by permissions if access control is enabled - if check_backend_access_control_mode(): + if backend_access_control_enabled(): search_results = await authorized_search( query_type=query_type, query_text=query_text, @@ -156,7 +156,7 @@ async def search( ) else: # This is for maintaining backwards compatibility - if check_backend_access_control_mode(): + if backend_access_control_enabled(): return_value = [] for search_result in search_results: prepared_search_results = await prepare_search_result(search_result) diff --git a/cognee/modules/users/methods/get_authenticated_user.py b/cognee/modules/users/methods/get_authenticated_user.py index 3cc16f3a8..d6d701737 100644 --- a/cognee/modules/users/methods/get_authenticated_user.py +++ b/cognee/modules/users/methods/get_authenticated_user.py @@ -5,7 +5,7 @@ from ..models import User from ..get_fastapi_users import get_fastapi_users from .get_default_user import get_default_user from cognee.shared.logging_utils import get_logger -from cognee.context_global_variables import check_backend_access_control_mode +from cognee.context_global_variables import backend_access_control_enabled logger = get_logger("get_authenticated_user") @@ -13,7 +13,7 @@ logger = get_logger("get_authenticated_user") # Check environment variable to determine authentication requirement REQUIRE_AUTHENTICATION = ( 
os.getenv("REQUIRE_AUTHENTICATION", "false").lower() == "true" - or check_backend_access_control_mode() + or backend_access_control_enabled() ) fastapi_users = get_fastapi_users() diff --git a/cognee/tests/test_search_db.py b/cognee/tests/test_search_db.py index ea3f0ea44..bd11dc62e 100644 --- a/cognee/tests/test_search_db.py +++ b/cognee/tests/test_search_db.py @@ -147,9 +147,9 @@ async def main(): f"{name}: expected single-element list, got {len(search_results)}" ) - from cognee.context_global_variables import check_backend_access_control_mode + from cognee.context_global_variables import backend_access_control_enabled - if check_backend_access_control_mode(): + if backend_access_control_enabled(): text = search_results[0]["search_result"][0] else: text = search_results[0] From c0e5ce04cedbf3bc4511e73ed2e2276e21c2f978 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 6 Nov 2025 14:13:55 +0100 Subject: [PATCH 27/28] Fix: fixes session history test for multiuser mode (#1746) ## Description Fixes failing session history test ## Type of Change - [x] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
---
 .../persist_sessions_in_knowledge_graph.py | 2 +-
 cognee/tasks/memify/cognify_session.py | 7 ++++---
 cognee/tests/test_conversation_history.py | 6 +++---
 .../modules/memify_tasks/test_cognify_session.py | 12 ++++++++----
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py b/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
index c0ba0a4d9..92d64c156 100644
--- a/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
+++ b/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
@@ -40,7 +40,7 @@ async def persist_sessions_in_knowledge_graph_pipeline(
     extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
 
     enrichment_tasks = [
-        Task(cognify_session),
+        Task(cognify_session, dataset_id=dataset_to_write[0].id),
     ]
 
     result = await memify(
diff --git a/cognee/tasks/memify/cognify_session.py b/cognee/tasks/memify/cognify_session.py
index 7c276169a..f53f9afb1 100644
--- a/cognee/tasks/memify/cognify_session.py
+++ b/cognee/tasks/memify/cognify_session.py
@@ -6,7 +6,7 @@ from cognee.shared.logging_utils import get_logger
 
 logger = get_logger("cognify_session")
 
 
-async def cognify_session(data):
+async def cognify_session(data, dataset_id=None):
     """
     Process and cognify session data into the knowledge graph.
 
@@ -16,6 +16,7 @@
 
     Args:
         data: Session string containing Question, Context, and Answer information.
+        dataset_id: Optional ID of the dataset the session is added to and cognified into.
 
     Raises:
         CogneeValidationError: If data is None or empty.
 
@@ -28,9 +29,9 @@
 
         logger.info("Processing session data for cognification")
 
-        await cognee.add(data, node_set=["user_sessions_from_cache"])
+        await cognee.add(data, dataset_id=dataset_id, node_set=["user_sessions_from_cache"])
         logger.debug("Session data added to cognee with node_set: user_sessions")
-        await cognee.cognify()
+        await cognee.cognify(datasets=[dataset_id])
 
         logger.info("Session data successfully cognified")
     except CogneeValidationError:
diff --git a/cognee/tests/test_conversation_history.py b/cognee/tests/test_conversation_history.py
index 6b5b737f1..783baf563 100644
--- a/cognee/tests/test_conversation_history.py
+++ b/cognee/tests/test_conversation_history.py
@@ -56,10 +56,10 @@ async def main():
         """DataCo is a data analytics company.
They help businesses make sense of their data.""" ) - await cognee.add(text_1, dataset_name) - await cognee.add(text_2, dataset_name) + await cognee.add(data=text_1, dataset_name=dataset_name) + await cognee.add(data=text_2, dataset_name=dataset_name) - await cognee.cognify([dataset_name]) + await cognee.cognify(datasets=[dataset_name]) user = await get_default_user() diff --git a/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py b/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py index c23640fbd..8c2448287 100644 --- a/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +++ b/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py @@ -16,9 +16,11 @@ async def test_cognify_session_success(): patch("cognee.add", new_callable=AsyncMock) as mock_add, patch("cognee.cognify", new_callable=AsyncMock) as mock_cognify, ): - await cognify_session(session_data) + await cognify_session(session_data, dataset_id="123") - mock_add.assert_called_once_with(session_data, node_set=["user_sessions_from_cache"]) + mock_add.assert_called_once_with( + session_data, dataset_id="123", node_set=["user_sessions_from_cache"] + ) mock_cognify.assert_called_once() @@ -101,7 +103,9 @@ async def test_cognify_session_with_special_characters(): patch("cognee.add", new_callable=AsyncMock) as mock_add, patch("cognee.cognify", new_callable=AsyncMock) as mock_cognify, ): - await cognify_session(session_data) + await cognify_session(session_data, dataset_id="123") - mock_add.assert_called_once_with(session_data, node_set=["user_sessions_from_cache"]) + mock_add.assert_called_once_with( + session_data, dataset_id="123", node_set=["user_sessions_from_cache"] + ) mock_cognify.assert_called_once() From 5bc83968f81982ad90f37e41c205c90bb1b9b3d5 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 6 Nov 2025 15:22:48 +0100 Subject: [PATCH 28/28] feature: text chunker with overlap (#1732) ## Description - Implements `TextChunkerWithOverlap` with configurable `chunk_overlap_ratio` - Abstracts chunk_data generation via `get_chunk_data` callable (defaults to `chunk_by_paragraph`) - Parametrized tests verify `TextChunker` and `TextChunkerWithOverlap` (0% overlap) produce identical output for all edge cases. - Overlap-specific tests validate `TextChunkerWithOverlap` behavior ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
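## Usage Sketch

A minimal sketch of how the new chunker is constructed, assuming an already-loaded cognee `Document` instance; the `document` object and the `read_text` helper below are illustrative placeholders, not part of this PR. Only the constructor signature and the derived overlap sizes come from the implementation.

```python
from cognee.modules.chunking.text_chunker_with_overlap import TextChunkerWithOverlap


def read_text():
    # Placeholder text source; get_text is assumed to be a callable
    # yielding the document's text, as with the existing TextChunker.
    yield "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."


# `document` is assumed to be an existing cognee Document instance.
chunker = TextChunkerWithOverlap(
    document=document,
    get_text=read_text,
    max_chunk_size=512,
    chunk_overlap_ratio=0.2,  # chunk_overlap = int(512 * 0.2) = 102 units carried into the next chunk
)
# With chunk_overlap_ratio > 0 and no custom get_chunk_data, paragraphs are cut at
# int(0.5 * 0.2 * 512) = 51 units so whole paragraphs can fit inside the overlap window.
```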
---------
Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
---
 .../chunking/text_chunker_with_overlap.py | 124 +++++++
 .../modules/chunking/test_text_chunker.py | 248 ++++++++++++++
 .../test_text_chunker_with_overlap.py | 324 ++++++++++++++++++
 3 files changed, 696 insertions(+)
 create mode 100644 cognee/modules/chunking/text_chunker_with_overlap.py
 create mode 100644 cognee/tests/unit/modules/chunking/test_text_chunker.py
 create mode 100644 cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py

diff --git a/cognee/modules/chunking/text_chunker_with_overlap.py b/cognee/modules/chunking/text_chunker_with_overlap.py
new file mode 100644
index 000000000..4b9c23079
--- /dev/null
+++ b/cognee/modules/chunking/text_chunker_with_overlap.py
@@ -0,0 +1,124 @@
+from cognee.shared.logging_utils import get_logger
+from uuid import NAMESPACE_OID, uuid5
+
+from cognee.tasks.chunks import chunk_by_paragraph
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class TextChunkerWithOverlap(Chunker):
+    def __init__(
+        self,
+        document,
+        get_text: callable,
+        max_chunk_size: int,
+        chunk_overlap_ratio: float = 0.0,
+        get_chunk_data: callable = None,
+    ):
+        super().__init__(document, get_text, max_chunk_size)
+        self._accumulated_chunk_data = []
+        self._accumulated_size = 0
+        self.chunk_overlap_ratio = chunk_overlap_ratio
+        self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
+
+        if get_chunk_data is not None:
+            self.get_chunk_data = get_chunk_data
+        elif chunk_overlap_ratio > 0:
+            paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, paragraph_max_size, batch_paragraphs=True
+            )
+        else:
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, self.max_chunk_size, batch_paragraphs=True
+            )
+
+    def _accumulation_overflows(self, chunk_data):
+        """Check if adding chunk_data would exceed max_chunk_size."""
+        return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
+
+    def _accumulate_chunk_data(self, chunk_data):
+        """Add chunk_data to the current accumulation."""
+        self._accumulated_chunk_data.append(chunk_data)
+        self._accumulated_size += chunk_data["chunk_size"]
+
+    def _clear_accumulation(self):
+        """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
+        if self.chunk_overlap == 0:
+            self._accumulated_chunk_data = []
+            self._accumulated_size = 0
+            return
+
+        # Keep chunk_data from the end that fit in overlap
+        overlap_chunk_data = []
+        overlap_size = 0
+
+        for chunk_data in reversed(self._accumulated_chunk_data):
+            if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
+                overlap_chunk_data.insert(0, chunk_data)
+                overlap_size += chunk_data["chunk_size"]
+            else:
+                break
+
+        self._accumulated_chunk_data = overlap_chunk_data
+        self._accumulated_size = overlap_size
+
+    def _create_chunk(self, text, size, cut_type, chunk_id=None):
+        """Create a DocumentChunk with standard metadata."""
+        try:
+            return DocumentChunk(
+                id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                text=text,
+                chunk_size=size,
+                is_part_of=self.document,
+                chunk_index=self.chunk_index,
+                cut_type=cut_type,
+                contains=[],
+                metadata={"index_fields": ["text"]},
+            )
+        except Exception as e:
+            logger.error(e)
+            raise e
+
+    def _create_chunk_from_accumulation(self):
+        """Create a DocumentChunk from current accumulated chunk_data."""
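+        # The accumulated piece texts are joined with single spaces below; note
+        # that these joiner spaces are not counted in _accumulated_size.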
" ".join(chunk["text"] for chunk in self._accumulated_chunk_data) + return self._create_chunk( + text=chunk_text, + size=self._accumulated_size, + cut_type=self._accumulated_chunk_data[-1]["cut_type"], + ) + + def _emit_chunk(self, chunk_data): + """Emit a chunk when accumulation overflows.""" + if len(self._accumulated_chunk_data) > 0: + chunk = self._create_chunk_from_accumulation() + self._clear_accumulation() + self._accumulate_chunk_data(chunk_data) + else: + # Handle single chunk_data exceeding max_chunk_size + chunk = self._create_chunk( + text=chunk_data["text"], + size=chunk_data["chunk_size"], + cut_type=chunk_data["cut_type"], + chunk_id=chunk_data["chunk_id"], + ) + + self.chunk_index += 1 + return chunk + + async def read(self): + async for content_text in self.get_text(): + for chunk_data in self.get_chunk_data(content_text): + if not self._accumulation_overflows(chunk_data): + self._accumulate_chunk_data(chunk_data) + continue + + yield self._emit_chunk(chunk_data) + + if len(self._accumulated_chunk_data) == 0: + return + + yield self._create_chunk_from_accumulation() diff --git a/cognee/tests/unit/modules/chunking/test_text_chunker.py b/cognee/tests/unit/modules/chunking/test_text_chunker.py new file mode 100644 index 000000000..d535f74b0 --- /dev/null +++ b/cognee/tests/unit/modules/chunking/test_text_chunker.py @@ -0,0 +1,248 @@ +"""Unit tests for TextChunker and TextChunkerWithOverlap behavioral equivalence.""" + +import pytest +from uuid import uuid4 + +from cognee.modules.chunking.TextChunker import TextChunker +from cognee.modules.chunking.text_chunker_with_overlap import TextChunkerWithOverlap +from cognee.modules.data.processing.document_types import Document + + +@pytest.fixture(params=["TextChunker", "TextChunkerWithOverlap"]) +def chunker_class(request): + """Parametrize tests to run against both implementations.""" + return TextChunker if request.param == "TextChunker" else TextChunkerWithOverlap + + +@pytest.fixture +def make_text_generator(): + """Factory for async text generators.""" + + def _factory(*texts): + async def gen(): + for text in texts: + yield text + + return gen + + return _factory + + +async def collect_chunks(chunker): + """Consume async generator and return list of chunks.""" + chunks = [] + async for chunk in chunker.read(): + chunks.append(chunk) + return chunks + + +@pytest.mark.asyncio +async def test_empty_input_produces_no_chunks(chunker_class, make_text_generator): + """Empty input should yield no chunks.""" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator("") + chunker = chunker_class(document, get_text, max_chunk_size=512) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 0, "Empty input should produce no chunks" + + +@pytest.mark.asyncio +async def test_whitespace_only_input_emits_single_chunk(chunker_class, make_text_generator): + """Whitespace-only input should produce exactly one chunk with unchanged text.""" + whitespace_text = " \n\t \r\n " + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(whitespace_text) + chunker = chunker_class(document, get_text, max_chunk_size=512) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 1, "Whitespace-only input should produce exactly one chunk" + assert chunks[0].text == whitespace_text, "Chunk 
+    assert chunks[0].text == whitespace_text, "Chunk text should equal input (whitespace preserved)"
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+
+
+@pytest.mark.asyncio
+async def test_single_paragraph_below_limit_emits_one_chunk(chunker_class, make_text_generator):
+    """Single paragraph below limit should emit exactly one chunk."""
+    text = "This is a short paragraph."
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=512)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 1, "Single short paragraph should produce exactly one chunk"
+    assert chunks[0].text == text, "Chunk text should match input"
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[0].chunk_size > 0, "Chunk should have positive size"
+
+
+@pytest.mark.asyncio
+async def test_oversized_paragraph_gets_emitted_as_a_single_chunk(
+    chunker_class, make_text_generator
+):
+    """Oversized paragraph from chunk_by_paragraph should be emitted as single chunk."""
+    text = ("A" * 1500) + ". Next sentence."
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=50)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 2, "Should produce 2 chunks (oversized paragraph + next sentence)"
+    assert chunks[0].chunk_size > 50, "First chunk should be oversized"
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[1].chunk_index == 1, "Second chunk should have index 1"
+
+
+@pytest.mark.asyncio
+async def test_overflow_on_next_paragraph_emits_separate_chunk(chunker_class, make_text_generator):
+    """First paragraph near limit plus small paragraph should produce two separate chunks."""
+    first_para = " ".join(["word"] * 5)
+    second_para = "Short text."
+    text = first_para + " " + second_para
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=10)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 2, "Should produce 2 chunks due to overflow"
+    assert chunks[0].text.strip() == first_para, "First chunk should contain only first paragraph"
+    assert chunks[1].text.strip() == second_para, (
+        "Second chunk should contain only second paragraph"
+    )
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[1].chunk_index == 1, "Second chunk should have index 1"
+
+
+@pytest.mark.asyncio
+async def test_small_paragraphs_batch_correctly(chunker_class, make_text_generator):
+    """Multiple small paragraphs should batch together with joiner spaces counted."""
+    paragraphs = [" ".join(["word"] * 12) for _ in range(40)]
+    text = " ".join(paragraphs)
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=49)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 20, (
+        "Should batch paragraphs (2 per chunk: 12 words × 2 tokens = 24, 24 + 1 joiner + 24 = 49)"
+    )
+    assert all(c.chunk_index == i for i, c in enumerate(chunks)), (
+        "Chunk indices should be sequential"
+    )
+    all_text = " ".join(chunk.text.strip() for chunk in chunks)
+    expected_text = " ".join(paragraphs)
+    assert all_text == expected_text, "All paragraph text should be preserved with correct spacing"
+
+
+@pytest.mark.asyncio
+async def test_alternating_large_and_small_paragraphs_dont_batch(
+    chunker_class, make_text_generator
+):
+    """Alternating near-max and small paragraphs should each become separate chunks."""
+    large1 = "word" * 15 + "."
+    small1 = "Short."
+    large2 = "word" * 15 + "."
+    small2 = "Tiny."
+    text = large1 + " " + small1 + " " + large2 + " " + small2
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    max_chunk_size = 10
+    get_text = make_text_generator(text)
+    chunker = chunker_class(document, get_text, max_chunk_size=max_chunk_size)
+    chunks = await collect_chunks(chunker)
+
+    assert len(chunks) == 4, "Should produce multiple chunks"
+    assert all(c.chunk_index == i for i, c in enumerate(chunks)), (
+        "Chunk indices should be sequential"
+    )
+    assert chunks[0].chunk_size > max_chunk_size, (
+        "First chunk should be oversized (large paragraph)"
+    )
+    assert chunks[1].chunk_size <= max_chunk_size, "Second chunk should be small (small paragraph)"
+    assert chunks[2].chunk_size > max_chunk_size, (
+        "Third chunk should be oversized (large paragraph)"
+    )
+    assert chunks[3].chunk_size <= max_chunk_size, "Fourth chunk should be small (small paragraph)"
+
+
+@pytest.mark.asyncio
+async def test_chunk_indices_and_ids_are_deterministic(chunker_class, make_text_generator):
+    """Running chunker twice on identical input should produce identical indices and IDs."""
+    sentence1 = "one " * 4 + ". "
+    sentence2 = "two " * 4 + ". "
+    sentence3 = "one " * 4 + ". "
+    sentence4 = "two " * 4 + ". "
" + text = sentence1 + sentence2 + sentence3 + sentence4 + doc_id = uuid4() + max_chunk_size = 20 + + document1 = Document( + id=doc_id, + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text1 = make_text_generator(text) + chunker1 = chunker_class(document1, get_text1, max_chunk_size=max_chunk_size) + chunks1 = await collect_chunks(chunker1) + + document2 = Document( + id=doc_id, + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text2 = make_text_generator(text) + chunker2 = chunker_class(document2, get_text2, max_chunk_size=max_chunk_size) + chunks2 = await collect_chunks(chunker2) + + assert len(chunks1) == 2, "Should produce exactly 2 chunks (4 sentences, 2 per chunk)" + assert len(chunks2) == 2, "Should produce exactly 2 chunks (4 sentences, 2 per chunk)" + assert [c.chunk_index for c in chunks1] == [0, 1], "First run indices should be [0, 1]" + assert [c.chunk_index for c in chunks2] == [0, 1], "Second run indices should be [0, 1]" + assert chunks1[0].id == chunks2[0].id, "First chunk ID should be deterministic" + assert chunks1[1].id == chunks2[1].id, "Second chunk ID should be deterministic" + assert chunks1[0].id != chunks1[1].id, "Chunk IDs should be unique within a run" diff --git a/cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py b/cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py new file mode 100644 index 000000000..9d7be6936 --- /dev/null +++ b/cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py @@ -0,0 +1,324 @@ +"""Unit tests for TextChunkerWithOverlap overlap behavior.""" + +import sys +import pytest +from uuid import uuid4 +from unittest.mock import patch + +from cognee.modules.chunking.text_chunker_with_overlap import TextChunkerWithOverlap +from cognee.modules.data.processing.document_types import Document +from cognee.tasks.chunks import chunk_by_paragraph + + +@pytest.fixture +def make_text_generator(): + """Factory for async text generators.""" + + def _factory(*texts): + async def gen(): + for text in texts: + yield text + + return gen + + return _factory + + +@pytest.fixture +def make_controlled_chunk_data(): + """Factory for controlled chunk_data generators.""" + + def _factory(*sentences, chunk_size_per_sentence=10): + def _chunk_data(text): + return [ + { + "text": sentence, + "chunk_size": chunk_size_per_sentence, + "cut_type": "sentence", + "chunk_id": uuid4(), + } + for sentence in sentences + ] + + return _chunk_data + + return _factory + + +@pytest.mark.asyncio +async def test_half_overlap_preserves_content_across_chunks( + make_text_generator, make_controlled_chunk_data +): + """With 50% overlap, consecutive chunks should share half their content.""" + s1 = "one" + s2 = "two" + s3 = "three" + s4 = "four" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, chunk_size_per_sentence=10) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=20, + chunk_overlap_ratio=0.5, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 3, "Should produce exactly 3 chunks (s1+s2, s2+s3, s3+s4)" + assert [c.chunk_index for c in chunks] == [0, 1, 2], "Chunk indices should be [0, 1, 2]" + 
assert "one" in chunks[0].text and "two" in chunks[0].text, "Chunk 0 should contain s1 and s2" + assert "two" in chunks[1].text and "three" in chunks[1].text, ( + "Chunk 1 should contain s2 (overlap) and s3" + ) + assert "three" in chunks[2].text and "four" in chunks[2].text, ( + "Chunk 2 should contain s3 (overlap) and s4" + ) + + +@pytest.mark.asyncio +async def test_zero_overlap_produces_no_duplicate_content( + make_text_generator, make_controlled_chunk_data +): + """With 0% overlap, no content should appear in multiple chunks.""" + s1 = "one" + s2 = "two" + s3 = "three" + s4 = "four" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, chunk_size_per_sentence=10) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=20, + chunk_overlap_ratio=0.0, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 2, "Should produce exactly 2 chunks (s1+s2, s3+s4)" + assert "one" in chunks[0].text and "two" in chunks[0].text, ( + "First chunk should contain s1 and s2" + ) + assert "three" in chunks[1].text and "four" in chunks[1].text, ( + "Second chunk should contain s3 and s4" + ) + assert "two" not in chunks[1].text and "three" not in chunks[0].text, ( + "No overlap: end of chunk 0 should not appear in chunk 1" + ) + + +@pytest.mark.asyncio +async def test_small_overlap_ratio_creates_minimal_overlap( + make_text_generator, make_controlled_chunk_data +): + """With 25% overlap ratio, chunks should have minimal overlap.""" + s1 = "alpha" + s2 = "beta" + s3 = "gamma" + s4 = "delta" + s5 = "epsilon" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, s5, chunk_size_per_sentence=10) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=40, + chunk_overlap_ratio=0.25, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 2, "Should produce exactly 2 chunks" + assert [c.chunk_index for c in chunks] == [0, 1], "Chunk indices should be [0, 1]" + assert all(token in chunks[0].text for token in [s1, s2, s3, s4]), ( + "Chunk 0 should contain s1 through s4" + ) + assert s4 in chunks[1].text and s5 in chunks[1].text, ( + "Chunk 1 should contain overlap s4 and new content s5" + ) + + +@pytest.mark.asyncio +async def test_high_overlap_ratio_creates_significant_overlap( + make_text_generator, make_controlled_chunk_data +): + """With 75% overlap ratio, consecutive chunks should share most content.""" + s1 = "red" + s2 = "blue" + s3 = "green" + s4 = "yellow" + s5 = "purple" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, s5, chunk_size_per_sentence=5) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=20, + chunk_overlap_ratio=0.75, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 2, "Should produce exactly 2 chunks 
+    assert [c.chunk_index for c in chunks] == [0, 1], "Chunk indices should be [0, 1]"
+    assert all(token in chunks[0].text for token in [s1, s2, s3, s4]), (
+        "Chunk 0 should contain s1, s2, s3, s4"
+    )
+    assert all(token in chunks[1].text for token in [s2, s3, s4, s5]), (
+        "Chunk 1 should contain s2, s3, s4 (overlap) and s5"
+    )
+
+
+@pytest.mark.asyncio
+async def test_single_chunk_no_dangling_overlap(make_text_generator, make_controlled_chunk_data):
+    """Text that fits in one chunk should produce exactly one chunk, no overlap artifact."""
+    s1 = "alpha"
+    s2 = "beta"
+    text = "dummy"
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+    get_text = make_text_generator(text)
+    get_chunk_data = make_controlled_chunk_data(s1, s2, chunk_size_per_sentence=10)
+    chunker = TextChunkerWithOverlap(
+        document,
+        get_text,
+        max_chunk_size=20,
+        chunk_overlap_ratio=0.5,
+        get_chunk_data=get_chunk_data,
+    )
+    chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 1, (
+        "Should produce exactly 1 chunk when content fits within max_chunk_size"
+    )
+    assert chunks[0].chunk_index == 0, "Single chunk should have index 0"
+    assert "alpha" in chunks[0].text and "beta" in chunks[0].text, (
+        "Single chunk should contain all content"
+    )
+
+
+@pytest.mark.asyncio
+async def test_paragraph_chunking_with_overlap(make_text_generator):
+    """Test that chunk_by_paragraph integration produces 25% overlap between chunks."""
+
+    def mock_get_embedding_engine():
+        class MockEngine:
+            tokenizer = None
+
+        return MockEngine()
+
+    chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")
+
+    max_chunk_size = 20
+    overlap_ratio = 0.25  # 5 token overlap
+    paragraph_max_size = int(0.5 * overlap_ratio * max_chunk_size)  # = 2
+
+    text = (
+        "A0 A1. A2 A3. A4 A5. A6 A7. A8 A9. "  # 10 tokens (0-9)
+        "B0 B1. B2 B3. B4 B5. B6 B7. B8 B9. "  # 10 tokens (10-19)
+        "C0 C1. C2 C3. C4 C5. C6 C7. C8 C9. "  # 10 tokens (20-29)
+        "D0 D1. D2 D3. D4 D5. D6 D7. D8 D9. "  # 10 tokens (30-39)
+        "E0 E1. E2 E3. E4 E5. E6 E7. E8 E9."  # 10 tokens (40-49)
+    )
+
+    document = Document(
+        id=uuid4(),
+        name="test_document",
+        raw_data_location="/test/path",
+        external_metadata=None,
+        mime_type="text/plain",
+    )
+
+    get_text = make_text_generator(text)
+
+    def get_chunk_data(text_input):
+        return chunk_by_paragraph(
+            text_input, max_chunk_size=paragraph_max_size, batch_paragraphs=True
+        )
+
+    with patch.object(
+        chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
+    ):
+        chunker = TextChunkerWithOverlap(
+            document,
+            get_text,
+            max_chunk_size=max_chunk_size,
+            chunk_overlap_ratio=overlap_ratio,
+            get_chunk_data=get_chunk_data,
+        )
+        chunks = [chunk async for chunk in chunker.read()]
+
+    assert len(chunks) == 3, f"Should produce exactly 3 chunks, got {len(chunks)}"
+
+    assert chunks[0].chunk_index == 0, "First chunk should have index 0"
+    assert chunks[1].chunk_index == 1, "Second chunk should have index 1"
+    assert chunks[2].chunk_index == 2, "Third chunk should have index 2"
+
+    assert "A0" in chunks[0].text, "Chunk 0 should start with A0"
+    assert "A9" in chunks[0].text, "Chunk 0 should contain A9"
+    assert "B0" in chunks[0].text, "Chunk 0 should contain B0"
+    assert "B9" in chunks[0].text, "Chunk 0 should contain up to B9 (20 tokens)"
+
+    assert "B" in chunks[1].text, "Chunk 1 should have overlap from B section"
+    assert "C" in chunks[1].text, "Chunk 1 should contain C section"
+    assert "D" in chunks[1].text, "Chunk 1 should contain D section"
+
+    assert "D" in chunks[2].text, "Chunk 2 should have overlap from D section"
+    assert "E0" in chunks[2].text, "Chunk 2 should contain E0"
+    assert "E9" in chunks[2].text, "Chunk 2 should end with E9"
+
+    chunk_0_end_words = chunks[0].text.split()[-4:]
+    chunk_1_words = chunks[1].text.split()
+    overlap_0_1 = any(word in chunk_1_words for word in chunk_0_end_words)
+    assert overlap_0_1, (
+        f"No overlap detected between chunks 0 and 1. "
+        f"Chunk 0 ends with: {chunk_0_end_words}, "
+        f"Chunk 1 starts with: {chunk_1_words[:6]}"
+    )
+
+    chunk_1_end_words = chunks[1].text.split()[-4:]
+    chunk_2_words = chunks[2].text.split()
+    overlap_1_2 = any(word in chunk_2_words for word in chunk_1_end_words)
+    assert overlap_1_2, (
+        f"No overlap detected between chunks 1 and 2. "
+        f"Chunk 1 ends with: {chunk_1_end_words}, "
+        f"Chunk 2 starts with: {chunk_2_words[:6]}"
+    )