From 6636fe8afdb39b39f1981fe7f1c65c4e087203a1 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 25 Sep 2025 18:03:17 +0200 Subject: [PATCH 01/61] refactor: Add maximum document batch size for document processing --- .../modules/pipelines/operations/run_tasks.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index 62d4972ad..4a86a5807 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -37,6 +37,8 @@ from ..tasks.task import Task logger = get_logger("run_tasks(tasks: [Task], data)") +# TODO: See if this parameter should be configurable as input for run_tasks itself +DOCUMENT_BATCH_SIZE = 10 def override_run_tasks(new_gen): @@ -266,24 +268,29 @@ async def run_tasks( if incremental_loading: data = await resolve_data_directories(data) - # Create async tasks per data item that will run the pipeline for the data item - data_item_tasks = [ - asyncio.create_task( - _run_tasks_data_item( - data_item, - dataset, - tasks, - pipeline_name, - pipeline_id, - pipeline_run_id, - context, - user, - incremental_loading, + # Create and gather batches of async tasks of data items that will run the pipeline for the data item + results = [] + for start in range(0, len(data), DOCUMENT_BATCH_SIZE): + document_batch = data[start : start + DOCUMENT_BATCH_SIZE] + + data_item_tasks = [ + asyncio.create_task( + _run_tasks_data_item( + data_item, + dataset, + tasks, + pipeline_name, + pipeline_id, + pipeline_run_id, + context, + user, + incremental_loading, + ) ) - ) - for data_item in data - ] - results = await asyncio.gather(*data_item_tasks) + for data_item in document_batch + ] + + results.extend(await asyncio.gather(*data_item_tasks)) # Remove skipped data items from results results = [result for result in results if result] From 9206d8536b89d4292c0286a35985665ce6f133d1 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 6 Oct 2025 17:45:22 +0200 Subject: [PATCH 02/61] initial changes, still need to work on this. 
commit so I can checkout to diff branch --- .github/workflows/examples_tests.yml | 58 ++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 4eb9e184f..406420351 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -85,8 +85,8 @@ jobs: run: uv run python ./cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py - test-dynamic-steps-metrics: - name: Run Dynamic Steps Example + test-multiple-examples: + name: Run Multiple Example Scripts runs-on: ubuntu-22.04 steps: - name: Check out repository @@ -97,7 +97,7 @@ jobs: with: python-version: '3.11.x' - - name: Run Dynamic Steps Tests + - name: Run Dynamic Steps Example env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -110,6 +110,58 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./examples/python/dynamic_steps_example.py + - name: Run Temporal Example + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./examples/python/temporal_example.py + + - name: Run Ontology Demo Example + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./examples/python/ontology_demo_example.py + + - name: Run Temporal Example + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./examples/python/temporal_example.py + + - name: Run Agentic Reasoning Example + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./examples/python/agentic_reasoning_procurement_example.py + test-memify: name: Run Memify Example runs-on: ubuntu-22.04 From 2932a627bbc674d0a4929b7be82b0e150ec8df50 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 9 Oct 2025 09:45:26 +0200 Subject: [PATCH 03/61] test: Potential fix for soft deletion test --- .github/workflows/test_different_operating_systems.yml | 
7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/test_different_operating_systems.yml b/.github/workflows/test_different_operating_systems.yml index 6eb5744f3..e784c9ca3 100644 --- a/.github/workflows/test_different_operating_systems.yml +++ b/.github/workflows/test_different_operating_systems.yml @@ -193,6 +193,13 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Path setup + if: ${{ matrix.os }} == 'windows-latest' + shell: bash + run: | + PATH=$(printf '%s\n' "$PATH" | grep -vi '/git/usr/bin' | paste -sd: -) + export PATH + - name: Run Soft Deletion Tests env: ENV: 'dev' From d1d8e334716d81fe6e8b1f1b185743197f27d79e Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 9 Oct 2025 10:48:21 +0200 Subject: [PATCH 04/61] test: Fix windows tests. First try of potential fixes. --- cognee/tests/test_library.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index c5e6cc64b..fe1a0bdfa 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -82,6 +82,11 @@ async def main(): data_root_directory = get_storage_config()["data_root_directory"] assert not os.path.isdir(data_root_directory), "Local data files are not deleted" + from cognee.infrastructure.databases.relational import get_relational_engine + + get_relational_engine().get_session().close() + await get_relational_engine().engine.dispose() + # Assert relational, vector and graph databases have been cleaned properly await cognee.prune.prune_system(metadata=True) @@ -89,7 +94,7 @@ async def main(): collection_names = await connection.table_names() assert len(collection_names) == 0, "LanceDB vector database is not empty" - from cognee.infrastructure.databases.relational import get_relational_engine + db_path = get_relational_engine().db_path dir_path = os.path.dirname(db_path) From ee96d8f940f7248dbc9a6b6cecbccd4c6f7fc24b Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 9 Oct 2025 11:00:54 +0200 Subject: [PATCH 05/61] chore: fix formatting --- cognee/tests/test_library.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index fe1a0bdfa..2933c77ba 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -94,8 +94,6 @@ async def main(): collection_names = await connection.table_names() assert len(collection_names) == 0, "LanceDB vector database is not empty" - - db_path = get_relational_engine().db_path dir_path = os.path.dirname(db_path) file_path = os.path.basename(db_path) From a44ab88519a784ef22ca9d133200124373db8dc7 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 9 Oct 2025 17:31:27 +0200 Subject: [PATCH 06/61] test: try calling gc to fix windows issue --- .../databases/relational/sqlalchemy/SqlAlchemyAdapter.py | 4 +++- cognee/tests/test_library.py | 7 ++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 88d2abc7e..4908295ca 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -507,7 +507,9 @@ class SQLAlchemyAdapter: if self.engine.dialect.name == "sqlite": await self.engine.dispose(close=True) # Wait for the database connections to close and release the file (Windows) - await asyncio.sleep(2) 
+ import gc + gc.collect() + # await asyncio.sleep(2) db_directory = path.dirname(self.db_path) file_name = path.basename(self.db_path) file_storage = get_file_storage(db_directory) diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 2933c77ba..c5e6cc64b 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -82,11 +82,6 @@ async def main(): data_root_directory = get_storage_config()["data_root_directory"] assert not os.path.isdir(data_root_directory), "Local data files are not deleted" - from cognee.infrastructure.databases.relational import get_relational_engine - - get_relational_engine().get_session().close() - await get_relational_engine().engine.dispose() - # Assert relational, vector and graph databases have been cleaned properly await cognee.prune.prune_system(metadata=True) @@ -94,6 +89,8 @@ async def main(): collection_names = await connection.table_names() assert len(collection_names) == 0, "LanceDB vector database is not empty" + from cognee.infrastructure.databases.relational import get_relational_engine + db_path = get_relational_engine().db_path dir_path = os.path.dirname(db_path) file_path = os.path.basename(db_path) From a7a2631d53ba274511072e312745ecdabf9f8765 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 9 Oct 2025 17:50:20 +0200 Subject: [PATCH 07/61] chore: format --- .../databases/relational/sqlalchemy/SqlAlchemyAdapter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 4908295ca..36ba90db5 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -508,6 +508,7 @@ class SQLAlchemyAdapter: await self.engine.dispose(close=True) # Wait for the database connections to close and release the file (Windows) import gc + gc.collect() # await asyncio.sleep(2) db_directory = path.dirname(self.db_path) From abfcbc69d61ec8a71ed83a8dd32894f5e99d8248 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 10 Oct 2025 15:36:36 +0200 Subject: [PATCH 08/61] refactor: Have embedding calls run in async gather --- cognee/api/v1/cognify/cognify.py | 6 ++--- cognee/tasks/storage/index_data_points.py | 33 +++++++++++++---------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 1292d243a..6a9f68443 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -269,13 +269,13 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model=graph_model, config=config, custom_prompt=custom_prompt, - task_config={"batch_size": 10}, + task_config={"batch_size": 30}, ), # Generate knowledge graphs from the document chunks. 
Task( summarize_text, - task_config={"batch_size": 10}, + task_config={"batch_size": 30}, ), - Task(add_data_points, task_config={"batch_size": 10}), + Task(add_data_points, task_config={"batch_size": 100}), ] return default_tasks diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 362412657..ebc4640d6 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,6 +1,6 @@ -from cognee.shared.logging_utils import get_logger +import asyncio -from cognee.infrastructure.databases.exceptions import EmbeddingException +from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint @@ -33,18 +33,23 @@ async def index_data_points(data_points: list[DataPoint]): indexed_data_point.metadata["index_fields"] = [field_name] index_points[index_name].append(indexed_data_point) - for index_name_and_field, indexable_points in index_points.items(): - first_occurence = index_name_and_field.index("_") - index_name = index_name_and_field[:first_occurence] - field_name = index_name_and_field[first_occurence + 1 :] - try: - # In case the amount of indexable points is too large we need to send them in batches - batch_size = vector_engine.embedding_engine.get_batch_size() - for i in range(0, len(indexable_points), batch_size): - batch = indexable_points[i : i + batch_size] - await vector_engine.index_data_points(index_name, field_name, batch) - except EmbeddingException as e: - logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}") + tasks: list[asyncio.Task] = [] + batch_size = vector_engine.embedding_engine.get_batch_size() + + for index_name_and_field, points in index_points.items(): + first = index_name_and_field.index("_") + index_name = index_name_and_field[:first] + field_name = index_name_and_field[first + 1 :] + + # Split in the usual “range step batch_size” manner + for i in range(0, len(points), batch_size): + batch = points[i : i + batch_size] + tasks.append( + asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch)) + ) + + # Fire them all and wait until every task is done. + await asyncio.gather(*tasks) return data_points From 757d745b5d262975c05f5fe3bb3f410f5c3d72b7 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 10 Oct 2025 17:12:09 +0200 Subject: [PATCH 09/61] refactor: Optimize cognification speed --- cognee/api/v1/cognify/cognify.py | 4 ++-- .../databases/vector/embeddings/config.py | 4 ++-- cognee/tasks/storage/index_graph_edges.py | 15 +++++++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 6a9f68443..30afb269a 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -269,11 +269,11 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model=graph_model, config=config, custom_prompt=custom_prompt, - task_config={"batch_size": 30}, + task_config={"batch_size": 100}, ), # Generate knowledge graphs from the document chunks. 
Task( summarize_text, - task_config={"batch_size": 30}, + task_config={"batch_size": 100}, ), Task(add_data_points, task_config={"batch_size": 100}), ] diff --git a/cognee/infrastructure/databases/vector/embeddings/config.py b/cognee/infrastructure/databases/vector/embeddings/config.py index 24f724151..dcb55f4a4 100644 --- a/cognee/infrastructure/databases/vector/embeddings/config.py +++ b/cognee/infrastructure/databases/vector/embeddings/config.py @@ -26,9 +26,9 @@ class EmbeddingConfig(BaseSettings): def model_post_init(self, __context) -> None: # If embedding batch size is not defined use 2048 as default for OpenAI and 100 for all other embedding models if not self.embedding_batch_size and self.embedding_provider.lower() == "openai": - self.embedding_batch_size = 2048 + self.embedding_batch_size = 30 elif not self.embedding_batch_size: - self.embedding_batch_size = 100 + self.embedding_batch_size = 10 def to_dict(self) -> dict: """ diff --git a/cognee/tasks/storage/index_graph_edges.py b/cognee/tasks/storage/index_graph_edges.py index b7bf7a2b9..4fa8cfc75 100644 --- a/cognee/tasks/storage/index_graph_edges.py +++ b/cognee/tasks/storage/index_graph_edges.py @@ -1,3 +1,5 @@ +import asyncio + from cognee.modules.engine.utils.generate_edge_id import generate_edge_id from cognee.shared.logging_utils import get_logger from collections import Counter @@ -76,15 +78,20 @@ async def index_graph_edges( indexed_data_point.metadata["index_fields"] = [field_name] index_points[index_name].append(indexed_data_point) + # Get maximum batch size for embedding model + batch_size = vector_engine.embedding_engine.get_batch_size() + tasks: list[asyncio.Task] = [] + for index_name, indexable_points in index_points.items(): index_name, field_name = index_name.split(".") - # Get maximum batch size for embedding model - batch_size = vector_engine.embedding_engine.get_batch_size() - # We save the data in batches of {batch_size} to not put a lot of pressure on the database + # Create embedding tasks to run in parallel later for start in range(0, len(indexable_points), batch_size): batch = indexable_points[start : start + batch_size] - await vector_engine.index_data_points(index_name, field_name, batch) + tasks.append(vector_engine.index_data_points(index_name, field_name, batch)) + + # Start all embedding tasks and wait for completion + await asyncio.gather(*tasks) return None From 13d1133680a241a9423b57d760c2319c20b80670 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 10 Oct 2025 17:14:10 +0200 Subject: [PATCH 10/61] chore: Change comments --- cognee/tasks/storage/index_data_points.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index ebc4640d6..902789c80 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -41,14 +41,14 @@ async def index_data_points(data_points: list[DataPoint]): index_name = index_name_and_field[:first] field_name = index_name_and_field[first + 1 :] - # Split in the usual “range step batch_size” manner + # Create embedding requests per batch to run in parallel later for i in range(0, len(points), batch_size): batch = points[i : i + batch_size] tasks.append( asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch)) ) - # Fire them all and wait until every task is done. 
+ # Run all embedding requests in parallel await asyncio.gather(*tasks) return data_points From ecb285e36613a22d1ad7338b5aa13ade9ff21a9b Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 12 Oct 2025 13:46:12 +0200 Subject: [PATCH 11/61] added formatting --- cognee/modules/pipelines/operations/run_tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index 2a5bf81a8..2e0055384 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -91,7 +91,6 @@ async def run_tasks( if incremental_loading: data = await resolve_data_directories(data) - # Create and gather batches of async tasks of data items that will run the pipeline for the data item results = [] for start in range(0, len(data), DOCUMENT_BATCH_SIZE): From ef5965224ac49df72a21e56d9b0537df73ef6a37 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 13 Oct 2025 12:44:55 +0200 Subject: [PATCH 12/61] fix: Revert changes made to sql alchemy adapter for lib test --- .../databases/relational/sqlalchemy/SqlAlchemyAdapter.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 36ba90db5..88d2abc7e 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -507,10 +507,7 @@ class SQLAlchemyAdapter: if self.engine.dialect.name == "sqlite": await self.engine.dispose(close=True) # Wait for the database connections to close and release the file (Windows) - import gc - - gc.collect() - # await asyncio.sleep(2) + await asyncio.sleep(2) db_directory = path.dirname(self.db_path) file_name = path.basename(self.db_path) file_storage = get_file_storage(db_directory) From f81aeff0096664a7556d451056f813e4994ab2d2 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 13 Oct 2025 12:50:26 +0200 Subject: [PATCH 13/61] Revert "fix: Revert changes made to sql alchemy adapter for lib test" This reverts commit ef5965224ac49df72a21e56d9b0537df73ef6a37. 
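Patches 01 and 08-10 above lean on the same pattern: slice a list of work items into fixed-size batches and await each batch with asyncio.gather so that concurrency stays bounded. A minimal standalone sketch of that pattern follows; BATCH_SIZE, run_in_batches, and _demo_worker are illustrative names, not cognee APIs.

import asyncio
from typing import Awaitable, Callable, List, TypeVar

T = TypeVar("T")
R = TypeVar("R")

BATCH_SIZE = 10  # illustrative stand-in for DOCUMENT_BATCH_SIZE / the embedding batch size


async def run_in_batches(
    items: List[T],
    worker: Callable[[T], Awaitable[R]],
    batch_size: int = BATCH_SIZE,
) -> List[R]:
    """Run worker over items with at most batch_size calls in flight per batch."""
    results: List[R] = []
    for start in range(0, len(items), batch_size):
        batch = items[start : start + batch_size]
        # Each batch is awaited in full before the next one starts, which caps
        # concurrency without needing a semaphore.
        results.extend(await asyncio.gather(*(worker(item) for item in batch)))
    return results


async def _demo_worker(n: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for a per-item pipeline run or embedding request
    return n * n


if __name__ == "__main__":
    print(asyncio.run(run_in_batches(list(range(25)), _demo_worker)))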
--- .../databases/relational/sqlalchemy/SqlAlchemyAdapter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 88d2abc7e..36ba90db5 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -507,7 +507,10 @@ class SQLAlchemyAdapter: if self.engine.dialect.name == "sqlite": await self.engine.dispose(close=True) # Wait for the database connections to close and release the file (Windows) - await asyncio.sleep(2) + import gc + + gc.collect() + # await asyncio.sleep(2) db_directory = path.dirname(self.db_path) file_name = path.basename(self.db_path) file_storage = get_file_storage(db_directory) From 74ce78ddfe1542b470bade83c75828a70efdf4d4 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 13 Oct 2025 12:52:05 +0200 Subject: [PATCH 14/61] fix: Revert changes to sql alchemy for lib test --- .../databases/relational/sqlalchemy/SqlAlchemyAdapter.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 93252ab6e..380ce9917 100644 --- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -507,10 +507,7 @@ class SQLAlchemyAdapter: if self.engine.dialect.name == "sqlite": await self.engine.dispose(close=True) # Wait for the database connections to close and release the file (Windows) - import gc - - gc.collect() - # await asyncio.sleep(2) + await asyncio.sleep(2) db_directory = path.dirname(self.db_path) file_name = path.basename(self.db_path) file_storage = get_file_storage(db_directory) From 832243034f291d63c70eeeb43828ccdbc69d7bc0 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 13 Oct 2025 16:21:19 +0200 Subject: [PATCH 15/61] test: small change in soft delete test --- .github/workflows/test_different_operating_systems.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_different_operating_systems.yml b/.github/workflows/test_different_operating_systems.yml index e784c9ca3..00e387ac4 100644 --- a/.github/workflows/test_different_operating_systems.yml +++ b/.github/workflows/test_different_operating_systems.yml @@ -197,7 +197,7 @@ jobs: if: ${{ matrix.os }} == 'windows-latest' shell: bash run: | - PATH=$(printf '%s\n' "$PATH" | grep -vi '/git/usr/bin' | paste -sd: -) + PATH=$(printf '%s' "$PATH" | tr ':' $'\n' | grep -vi '/git/usr/bin' | paste -sd: -) export PATH - name: Run Soft Deletion Tests From eb631a23ad6eeaba9c1111b598a6f4f955cd6c86 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 14 Oct 2025 13:57:41 +0200 Subject: [PATCH 16/61] refactor: set default numbers that are more reasonable --- cognee/api/v1/cognify/cognify.py | 6 +++--- cognee/infrastructure/databases/vector/embeddings/config.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 30afb269a..898c35518 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -269,13 +269,13 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model=graph_model, 
config=config, custom_prompt=custom_prompt, - task_config={"batch_size": 100}, + task_config={"batch_size": 20}, ), # Generate knowledge graphs from the document chunks. Task( summarize_text, - task_config={"batch_size": 100}, + task_config={"batch_size": 20}, ), - Task(add_data_points, task_config={"batch_size": 100}), + Task(add_data_points, task_config={"batch_size": 20}), ] return default_tasks diff --git a/cognee/infrastructure/databases/vector/embeddings/config.py b/cognee/infrastructure/databases/vector/embeddings/config.py index dcb55f4a4..314adbd99 100644 --- a/cognee/infrastructure/databases/vector/embeddings/config.py +++ b/cognee/infrastructure/databases/vector/embeddings/config.py @@ -24,11 +24,10 @@ class EmbeddingConfig(BaseSettings): model_config = SettingsConfigDict(env_file=".env", extra="allow") def model_post_init(self, __context) -> None: - # If embedding batch size is not defined use 2048 as default for OpenAI and 100 for all other embedding models if not self.embedding_batch_size and self.embedding_provider.lower() == "openai": - self.embedding_batch_size = 30 + self.embedding_batch_size = 1024 elif not self.embedding_batch_size: - self.embedding_batch_size = 10 + self.embedding_batch_size = 100 def to_dict(self) -> dict: """ From 84a23756f5c77ef3c7e0c78c4aff122416249341 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 14 Oct 2025 14:25:38 +0200 Subject: [PATCH 17/61] fix: Change chunk_size ot batch_size for temporal task --- cognee/api/v1/cognify/cognify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 898c35518..2c87dbc4b 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -311,7 +311,7 @@ async def get_temporal_tasks( max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), - Task(extract_events_and_timestamps, task_config={"chunk_size": 10}), + Task(extract_events_and_timestamps, task_config={"batch_size": 10}), Task(extract_knowledge_graph_from_events), Task(add_data_points, task_config={"batch_size": 10}), ] From 98daadbb0461ae99935032bde96d8c056f874050 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 14 Oct 2025 20:29:55 +0200 Subject: [PATCH 18/61] refactor: Add tenacity retry mechanism --- .../embeddings/LiteLLMEmbeddingEngine.py | 18 ++++++++++++++++-- .../embeddings/OllamaEmbeddingEngine.py | 19 ++++++++++++++++--- poetry.lock | 2 +- pyproject.toml | 3 ++- uv.lock | 4 +++- 5 files changed, 38 insertions(+), 8 deletions(-) diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index d68941d25..2a71d674d 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -1,8 +1,17 @@ import asyncio +import logging + from cognee.shared.logging_utils import get_logger from typing import List, Optional import numpy as np import math +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, +) import litellm import os from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine @@ -76,8 +85,13 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): enable_mocking = str(enable_mocking).lower() self.mock = enable_mocking in ("true", "1", "yes") - @embedding_sleep_and_retry_async() - 
@embedding_rate_limit_async + @retry( + stop=stop_after_delay(180), + wait=wait_exponential_jitter(1, 180), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def embed_text(self, text: List[str]) -> List[List[float]]: """ Embed a list of text strings into vector representations. diff --git a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py index e79ba3f6a..b8ee9c7df 100644 --- a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py @@ -3,8 +3,16 @@ from cognee.shared.logging_utils import get_logger import aiohttp from typing import List, Optional import os - +import litellm +import logging import aiohttp.http_exceptions +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, +) from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.llm.tokenizer.HuggingFace import ( @@ -69,7 +77,6 @@ class OllamaEmbeddingEngine(EmbeddingEngine): enable_mocking = str(enable_mocking).lower() self.mock = enable_mocking in ("true", "1", "yes") - @embedding_rate_limit_async async def embed_text(self, text: List[str]) -> List[List[float]]: """ Generate embedding vectors for a list of text prompts. @@ -92,7 +99,13 @@ class OllamaEmbeddingEngine(EmbeddingEngine): embeddings = await asyncio.gather(*[self._get_embedding(prompt) for prompt in text]) return embeddings - @embedding_sleep_and_retry_async() + @retry( + stop=stop_after_delay(180), + wait=wait_exponential_jitter(1, 180), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def _get_embedding(self, prompt: str) -> List[float]: """ Internal method to call the Ollama embeddings endpoint for a single prompt. diff --git a/poetry.lock b/poetry.lock index 551295733..ffc5ec575 100644 --- a/poetry.lock +++ b/poetry.lock @@ -12738,4 +12738,4 @@ posthog = ["posthog"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<=3.13" -content-hash = "38353807b06e5c06caaa107979529937b978204f0f405c6b38cee283f4a49d3c" +content-hash = "d8cd8a8db46416e0c844ff90df5bd64551ebf9a0c338fbb2023a61008ff5941d" diff --git a/pyproject.toml b/pyproject.toml index 3df57e1f5..7ac2915d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,8 @@ dependencies = [ "networkx>=3.4.2,<4", "uvicorn>=0.34.0,<1.0.0", "gunicorn>=20.1.0,<24", - "websockets>=15.0.1,<16.0.0" + "websockets>=15.0.1,<16.0.0", + "tenacity>=9.0.0", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 570da9289..5c06b96be 100644 --- a/uv.lock +++ b/uv.lock @@ -856,7 +856,7 @@ wheels = [ [[package]] name = "cognee" -version = "0.3.4" +version = "0.3.5" source = { editable = "." 
} dependencies = [ { name = "aiofiles" }, @@ -892,6 +892,7 @@ dependencies = [ { name = "rdflib" }, { name = "sqlalchemy" }, { name = "structlog" }, + { name = "tenacity" }, { name = "tiktoken" }, { name = "typing-extensions" }, { name = "uvicorn" }, @@ -1086,6 +1087,7 @@ requires-dist = [ { name = "sentry-sdk", extras = ["fastapi"], marker = "extra == 'monitoring'", specifier = ">=2.9.0,<3" }, { name = "sqlalchemy", specifier = ">=2.0.39,<3.0.0" }, { name = "structlog", specifier = ">=25.2.0,<26" }, + { name = "tenacity", specifier = ">=9.0.0" }, { name = "tiktoken", specifier = ">=0.8.0,<1.0.0" }, { name = "transformers", marker = "extra == 'codegraph'", specifier = ">=4.46.3,<5" }, { name = "transformers", marker = "extra == 'huggingface'", specifier = ">=4.46.3,<5" }, From 1b28f137431d30c940568406fab1678db9276c28 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 13:32:17 +0200 Subject: [PATCH 19/61] refactor: Optimize Cognee speed --- cognee/api/v1/cognify/cognify.py | 6 +++--- .../embeddings/FastembedEmbeddingEngine.py | 20 +++++++++++++++++-- .../embeddings/LiteLLMEmbeddingEngine.py | 11 ++-------- .../embeddings/OllamaEmbeddingEngine.py | 4 ++-- .../databases/vector/embeddings/config.py | 4 ++-- 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 9215c9369..3032bd4e8 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -269,13 +269,13 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model=graph_model, config=config, custom_prompt=custom_prompt, - task_config={"batch_size": 20}, + task_config={"batch_size": 100}, ), # Generate knowledge graphs from the document chunks. Task( summarize_text, - task_config={"batch_size": 20}, + task_config={"batch_size": 100}, ), - Task(add_data_points, task_config={"batch_size": 20}), + Task(add_data_points, task_config={"batch_size": 100}), ] return default_tasks diff --git a/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py index e34ab5d9d..c2acd516e 100644 --- a/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py @@ -1,8 +1,17 @@ -from cognee.shared.logging_utils import get_logger +import os +import logging from typing import List, Optional from fastembed import TextEmbedding import litellm -import os +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, +) + +from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.exceptions import EmbeddingException from cognee.infrastructure.llm.tokenizer.TikToken import ( @@ -57,6 +66,13 @@ class FastembedEmbeddingEngine(EmbeddingEngine): enable_mocking = str(enable_mocking).lower() self.mock = enable_mocking in ("true", "1", "yes") + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def embed_text(self, text: List[str]) -> List[List[float]]: """ Embed the given text into numerical vectors. 
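The retry policy that patches 18-19 attach to the embedding engines can be exercised in isolation. A hedged sketch follows, assuming tenacity is installed; FatalConfigError, FlakyUpstreamError, and embed_with_retries are illustrative names standing in for the real litellm exceptions and the embed_text call.

import asyncio
import logging
import random

from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logger = logging.getLogger("embedding_retry_sketch")


class FatalConfigError(Exception):
    """Errors retrying cannot fix, analogous to litellm.exceptions.NotFoundError above."""


class FlakyUpstreamError(Exception):
    """Transient failures worth retrying, e.g. rate limits or timeouts."""


@retry(
    stop=stop_after_delay(128),  # give up after roughly 128 seconds in total
    wait=wait_exponential_jitter(2, 128),  # exponential backoff from 2s, capped at 128s, with jitter
    retry=retry_if_not_exception_type(FatalConfigError),  # never retry unrecoverable errors
    before_sleep=before_sleep_log(logger, logging.DEBUG),  # log every backoff
    reraise=True,  # re-raise the last exception instead of tenacity's RetryError
)
async def embed_with_retries(text: str) -> list[float]:
    if random.random() < 0.5:
        raise FlakyUpstreamError("simulated transient embedding failure")
    return [0.0] * 8  # stand-in for a real embedding vector


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    print(asyncio.run(embed_with_retries("hello")))

With reraise=True, callers still see the original exception type once the time budget is exhausted, so existing error handling around the embedding call does not need to change.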
diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index 302950f66..03ce86bee 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -16,9 +16,6 @@ import litellm import os from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.exceptions import EmbeddingException -from cognee.infrastructure.llm.tokenizer.Gemini import ( - GeminiTokenizer, -) from cognee.infrastructure.llm.tokenizer.HuggingFace import ( HuggingFaceTokenizer, ) @@ -28,10 +25,6 @@ from cognee.infrastructure.llm.tokenizer.Mistral import ( from cognee.infrastructure.llm.tokenizer.TikToken import ( TikTokenTokenizer, ) -from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import ( - embedding_rate_limit_async, - embedding_sleep_and_retry_async, -) litellm.set_verbose = False logger = get_logger("LiteLLMEmbeddingEngine") @@ -86,8 +79,8 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): self.mock = enable_mocking in ("true", "1", "yes") @retry( - stop=stop_after_delay(180), - wait=wait_exponential_jitter(1, 180), + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), before_sleep=before_sleep_log(logger, logging.DEBUG), reraise=True, diff --git a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py index b8ee9c7df..2882b679a 100644 --- a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py @@ -100,8 +100,8 @@ class OllamaEmbeddingEngine(EmbeddingEngine): return embeddings @retry( - stop=stop_after_delay(180), - wait=wait_exponential_jitter(1, 180), + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), before_sleep=before_sleep_log(logger, logging.DEBUG), reraise=True, diff --git a/cognee/infrastructure/databases/vector/embeddings/config.py b/cognee/infrastructure/databases/vector/embeddings/config.py index 314adbd99..56cd79678 100644 --- a/cognee/infrastructure/databases/vector/embeddings/config.py +++ b/cognee/infrastructure/databases/vector/embeddings/config.py @@ -25,9 +25,9 @@ class EmbeddingConfig(BaseSettings): def model_post_init(self, __context) -> None: if not self.embedding_batch_size and self.embedding_provider.lower() == "openai": - self.embedding_batch_size = 1024 + self.embedding_batch_size = 36 elif not self.embedding_batch_size: - self.embedding_batch_size = 100 + self.embedding_batch_size = 36 def to_dict(self) -> dict: """ From fc4440da8c7b7cdfd4087f34c40ac90cc86bb839 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 14:43:21 +0200 Subject: [PATCH 20/61] refactor: update env template --- .env.template | 5 ++--- .../loaders/external/advanced_pdf_loader.py | 10 ++-------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/.env.template b/.env.template index 7fd3ba9e8..3137636d3 100644 --- a/.env.template +++ b/.env.template @@ -28,11 +28,10 @@ EMBEDDING_ENDPOINT="" EMBEDDING_API_VERSION="" EMBEDDING_DIMENSIONS=3072 EMBEDDING_MAX_TOKENS=8191 +EMBEDDING_BATCH_SIZE=36 # If embedding key is not 
provided same key set for LLM_API_KEY will be used #EMBEDDING_API_KEY="your_api_key" -# Note: OpenAI support up to 2048 elements and Gemini supports a maximum of 100 elements in an embedding batch, -# Cognee sets the optimal batch size for OpenAI and Gemini, but a custom size can be defined if necessary for other models -#EMBEDDING_BATCH_SIZE=2048 + # If using BAML structured output these env variables will be used BAML_LLM_PROVIDER=openai diff --git a/cognee/infrastructure/loaders/external/advanced_pdf_loader.py b/cognee/infrastructure/loaders/external/advanced_pdf_loader.py index 7bab8cac6..6d1412b77 100644 --- a/cognee/infrastructure/loaders/external/advanced_pdf_loader.py +++ b/cognee/infrastructure/loaders/external/advanced_pdf_loader.py @@ -14,14 +14,6 @@ from cognee.infrastructure.loaders.external.pypdf_loader import PyPdfLoader logger = get_logger(__name__) -try: - from unstructured.partition.pdf import partition_pdf -except ImportError as e: - logger.info( - "unstructured[pdf] not installed, can't use AdvancedPdfLoader, will use PyPdfLoader instead." - ) - raise ImportError from e - @dataclass class _PageBuffer: @@ -88,6 +80,8 @@ class AdvancedPdfLoader(LoaderInterface): **kwargs, } # Use partition to extract elements + from unstructured.partition.pdf import partition_pdf + elements = partition_pdf(**partition_kwargs) # Process elements into text content From 8692cd13381204a98100fb296bf971ad14ea7a32 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 16:03:17 +0100 Subject: [PATCH 21/61] feat: add count_nodes and count_edges methods to GraphDBInterface --- .../infrastructure/databases/graph/graph_db_interface.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index 65afdf275..abfdff784 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -159,6 +159,14 @@ class GraphDBInterface(ABC): - get_connections """ + @abstractmethod + async def count_nodes(self) -> int: + raise NotImplementedError + + @abstractmethod + async def count_edges(self) -> int: + raise NotImplementedError + @abstractmethod async def query(self, query: str, params: dict) -> List[Any]: """ From 5663c3fe3ab80f0eee7adb3576af4b579a1d8306 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 17:38:18 +0200 Subject: [PATCH 22/61] refactor: add batch size param to temporal graphs --- cognee/api/v1/cognify/cognify.py | 34 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 3032bd4e8..d29d8c939 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -44,6 +44,7 @@ async def cognify( graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, chunk_size: int = None, + batch_size: int = None, config: Config = None, vector_db_config: dict = None, graph_db_config: dict = None, @@ -105,6 +106,7 @@ async def cognify( Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2) Default limits: ~512-8192 tokens depending on models. Smaller chunks = more granular but potentially fragmented knowledge. + batch_size: Number of chunks to be processed in a single batch in Cognify tasks. vector_db_config: Custom vector database configuration for embeddings storage. 
graph_db_config: Custom graph database configuration for relationship storage. run_in_background: If True, starts processing asynchronously and returns immediately. @@ -209,10 +211,18 @@ async def cognify( } if temporal_cognify: - tasks = await get_temporal_tasks(user, chunker, chunk_size) + tasks = await get_temporal_tasks( + user=user, chunker=chunker, chunk_size=chunk_size, batch_size=batch_size + ) else: tasks = await get_default_tasks( - user, graph_model, chunker, chunk_size, config, custom_prompt + user=user, + graph_model=graph_model, + chunker=chunker, + chunk_size=chunk_size, + config=config, + custom_prompt=custom_prompt, + batch_size=batch_size, ) # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for @@ -238,6 +248,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's chunk_size: int = None, config: Config = None, custom_prompt: Optional[str] = None, + batch_size: int = 100, ) -> list[Task]: if config is None: ontology_config = get_ontology_env_config() @@ -256,6 +267,9 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's "ontology_config": {"ontology_resolver": get_default_ontology_resolver()} } + if batch_size is None: + batch_size = 100 + default_tasks = [ Task(classify_documents), Task(check_permissions_on_dataset, user=user, permissions=["write"]), @@ -269,20 +283,20 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model=graph_model, config=config, custom_prompt=custom_prompt, - task_config={"batch_size": 100}, + task_config={"batch_size": batch_size}, ), # Generate knowledge graphs from the document chunks. Task( summarize_text, - task_config={"batch_size": 100}, + task_config={"batch_size": batch_size}, ), - Task(add_data_points, task_config={"batch_size": 100}), + Task(add_data_points, task_config={"batch_size": batch_size}), ] return default_tasks async def get_temporal_tasks( - user: User = None, chunker=TextChunker, chunk_size: int = None + user: User = None, chunker=TextChunker, chunk_size: int = None, batch_size: int = 10 ) -> list[Task]: """ Builds and returns a list of temporal processing tasks to be executed in sequence. @@ -299,10 +313,14 @@ async def get_temporal_tasks( user (User, optional): The user requesting task execution, used for permission checks. chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. + batch_size (int, optional): Number of chunks to process in a single batch in Cognify Returns: list[Task]: A list of Task objects representing the temporal processing pipeline. 
""" + if batch_size is None: + batch_size = 10 + temporal_tasks = [ Task(classify_documents), Task(check_permissions_on_dataset, user=user, permissions=["write"]), @@ -311,9 +329,9 @@ async def get_temporal_tasks( max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), - Task(extract_events_and_timestamps, task_config={"batch_size": 10}), + Task(extract_events_and_timestamps, task_config={"batch_size": batch_size}), Task(extract_knowledge_graph_from_events), - Task(add_data_points, task_config={"batch_size": 10}), + Task(add_data_points, task_config={"batch_size": batch_size}), ] return temporal_tasks From f3ec1801025eb5cc1c2dc899a8aa3eca02ae4165 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 16:39:25 +0100 Subject: [PATCH 23/61] Implement count_edges and count_methods for Kuzu --- .../databases/graph/kuzu/adapter.py | 16 ++++++++++ cognee/tests/test_kuzu.py | 29 ++++++++++++++++--- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index 7b772097f..a31726c9a 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -185,6 +185,22 @@ class KuzuAdapter(GraphDBInterface): except FileNotFoundError: logger.warning(f"Kuzu S3 storage file not found: {self.db_path}") + async def count_edges(self) -> int: + query = """ + MATCH ()-[r]->() + RETURN COUNT(r); + """ + query_result = await self.query(query) + return query_result[0][0] + + async def count_nodes(self) -> int: + query = """ + MATCH (n) + RETURN COUNT(n); + """ + query_result = await self.query(query) + return query_result[0][0] + async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]: """ Execute a Kuzu query asynchronously with automatic reconnection. 
diff --git a/cognee/tests/test_kuzu.py b/cognee/tests/test_kuzu.py index 8749e42d0..e39edd06a 100644 --- a/cognee/tests/test_kuzu.py +++ b/cognee/tests/test_kuzu.py @@ -47,10 +47,31 @@ async def main(): pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt" ) + from cognee.infrastructure.databases.graph import get_graph_engine + + graph_engine = await get_graph_engine() + + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + assert edges_count == 0 and nodes_count == 0, "Kuzu graph database is not empty" + await cognee.add([explanation_file_path_quantum], dataset_name) + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + assert edges_count == 0 and nodes_count == 0, ( + "Kuzu graph database should be empty before cognify" + ) + await cognee.cognify([dataset_name]) + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + assert edges_count != 0 and nodes_count != 0, "Kuzu graph database should not be empty" + from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() @@ -114,11 +135,11 @@ async def main(): assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) - from cognee.infrastructure.databases.graph import get_graph_engine - graph_engine = await get_graph_engine() - nodes, edges = await graph_engine.get_graph_data() - assert len(nodes) == 0 and len(edges) == 0, "Kuzu graph database is not empty" + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + assert edges_count == 0 and nodes_count == 0, "Kuzu graph database is not empty" finally: # Ensure cleanup even if tests fail From 9367fa5d03f42e3a1feb4d7d0de61cd1bb547fd0 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 16:39:48 +0100 Subject: [PATCH 24/61] Prior to search, check if knowledge graph is empty --- cognee/api/v1/search/search.py | 12 +++++++++++- cognee/modules/data/exceptions/__init__.py | 1 + cognee/modules/data/exceptions/exceptions.py | 10 ++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 0a9e76e96..32035e612 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -1,13 +1,14 @@ from uuid import UUID from typing import Union, Optional, List, Type +from cognee.infrastructure.databases.graph import get_graph_engine from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.users.models import User from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult from cognee.modules.users.methods import get_default_user from cognee.modules.search.methods import search as search_function from cognee.modules.data.methods import get_authorized_existing_datasets -from cognee.modules.data.exceptions import DatasetNotFoundError +from cognee.modules.data.exceptions import DatasetNotFoundError, SearchOnEmptyGraphError async def search( @@ -175,6 +176,15 @@ async def search( if not datasets: raise DatasetNotFoundError(message="No datasets found.") + graph_engine = await get_graph_engine() + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + if nodes_count == 0 or edges_count == 0: + raise SearchOnEmptyGraphError( + message="Knowledge graph is empty, please ensure data is added and cognified." 
+ ) + filtered_search_results = await search_function( query_text=query_text, query_type=query_type, diff --git a/cognee/modules/data/exceptions/__init__.py b/cognee/modules/data/exceptions/__init__.py index 54af81070..ba943634d 100644 --- a/cognee/modules/data/exceptions/__init__.py +++ b/cognee/modules/data/exceptions/__init__.py @@ -9,4 +9,5 @@ from .exceptions import ( UnauthorizedDataAccessError, DatasetNotFoundError, DatasetTypeError, + SearchOnEmptyGraphError, ) diff --git a/cognee/modules/data/exceptions/exceptions.py b/cognee/modules/data/exceptions/exceptions.py index ac3b68e64..c2921750a 100644 --- a/cognee/modules/data/exceptions/exceptions.py +++ b/cognee/modules/data/exceptions/exceptions.py @@ -35,6 +35,16 @@ class DatasetNotFoundError(CogneeValidationError): super().__init__(message, name, status_code) +class SearchOnEmptyGraphError(CogneeValidationError): + def __init__( + self, + message: str = "Knowledge graph is empty, please ensure data is added and cognified.", + name: str = "SearchOnEmptyGraphError", + status_code=status.HTTP_400_BAD_REQUEST, + ): + super().__init__(message, name, status_code) + + class DatasetTypeError(CogneeValidationError): def __init__( self, From ea4a93efb172a82754a342084aa95393a0f11759 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 16:57:53 +0100 Subject: [PATCH 25/61] Implement count_nodes and count_edges methods for Neo4j --- .../databases/graph/neo4j_driver/adapter.py | 16 +++++++++++ cognee/tests/test_neo4j.py | 27 +++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 520295ed2..a61ab6f0b 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -87,6 +87,22 @@ class Neo4jAdapter(GraphDBInterface): async with self.driver.session(database=self.graph_database_name) as session: yield session + async def count_edges(self) -> int: + query = """ + MATCH ()-[r]->() + RETURN COUNT(r) as total_edges; + """ + query_result = await self.query(query) + return query_result[0]["total_edges"] + + async def count_nodes(self) -> int: + query = """ + MATCH (n) + RETURN COUNT(n) as total_nodes; + """ + query_result = await self.query(query) + return query_result[0]["total_nodes"] + @deadlock_retry() async def query( self, diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index c74b4ab65..11f6156bd 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -35,6 +35,15 @@ async def main(): explanation_file_path_nlp = os.path.join( pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" ) + from cognee.infrastructure.databases.graph import get_graph_engine + + graph_engine = await get_graph_engine() + + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + assert edges_count == 0 and nodes_count == 0, "Graph has to be empty" + await cognee.add([explanation_file_path_nlp], dataset_name) explanation_file_path_quantum = os.path.join( @@ -43,8 +52,18 @@ async def main(): await cognee.add([explanation_file_path_quantum], dataset_name) + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + assert edges_count == 0 and nodes_count == 0, "Graph has to be empty before cognify" + await cognee.cognify([dataset_name]) + edges_count = await 
graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + + assert edges_count != 0 and nodes_count != 0, "Graph shouldn't be empty" + from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() @@ -117,11 +136,9 @@ async def main(): assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) - from cognee.infrastructure.databases.graph import get_graph_engine - - graph_engine = await get_graph_engine() - nodes, edges = await graph_engine.get_graph_data() - assert len(nodes) == 0 and len(edges) == 0, "Neo4j graph database is not empty" + edges_count = await graph_engine.count_edges() + nodes_count = await graph_engine.count_nodes() + assert nodes_count == 0 and edges_count == 0, "Neo4j graph database is not empty" if __name__ == "__main__": From 96496f38ed1e4ce2dd63190c9cbf6a16338fbeb0 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 18:08:18 +0200 Subject: [PATCH 26/61] refactor: Switch to using tenacity for rate limiting --- .../llm/anthropic/adapter.py | 28 ++++++---- .../litellm_instructor/llm/gemini/adapter.py | 22 ++++++-- .../llm/generic_llm_api/adapter.py | 22 ++++++-- .../litellm_instructor/llm/mistral/adapter.py | 54 ++++++------------- .../litellm_instructor/llm/ollama/adapter.py | 41 +++++++++++--- .../litellm_instructor/llm/openai/adapter.py | 53 +++++++++++++----- 6 files changed, 142 insertions(+), 78 deletions(-) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py index 2d88a8271..bf19d6e86 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py @@ -1,19 +1,24 @@ +import logging from typing import Type from pydantic import BaseModel +import litellm import instructor +from cognee.shared.logging_utils import get_logger +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, +) -from cognee.infrastructure.llm.exceptions import MissingSystemPromptPathError from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( LLMInterface, ) -from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( - rate_limit_async, - sleep_and_retry_async, -) - -from cognee.infrastructure.llm.LLMGateway import LLMGateway from cognee.infrastructure.llm.config import get_llm_config +logger = get_logger() + class AnthropicAdapter(LLMInterface): """ @@ -35,8 +40,13 @@ class AnthropicAdapter(LLMInterface): self.model = model self.max_completion_tokens = max_completion_tokens - @sleep_and_retry_async() - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def acreate_structured_output( self, text_input: str, system_prompt: str, response_model: Type[BaseModel] ) -> BaseModel: diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py index 
510d29ce8..1187e0cad 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py @@ -12,11 +12,18 @@ from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( LLMInterface, ) -from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( - rate_limit_async, - sleep_and_retry_async, +import logging +from cognee.shared.logging_utils import get_logger +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, ) +logger = get_logger() + class GeminiAdapter(LLMInterface): """ @@ -58,8 +65,13 @@ class GeminiAdapter(LLMInterface): self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) - @sleep_and_retry_async() - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def acreate_structured_output( self, text_input: str, system_prompt: str, response_model: Type[BaseModel] ) -> BaseModel: diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py index 917599d4d..8bbbaa2cc 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py @@ -12,11 +12,18 @@ from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( LLMInterface, ) -from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( - rate_limit_async, - sleep_and_retry_async, +import logging +from cognee.shared.logging_utils import get_logger +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, ) +logger = get_logger() + class GenericAPIAdapter(LLMInterface): """ @@ -58,8 +65,13 @@ class GenericAPIAdapter(LLMInterface): self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) - @sleep_and_retry_async() - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def acreate_structured_output( self, text_input: str, system_prompt: str, response_model: Type[BaseModel] ) -> BaseModel: diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py index c4e51b70b..78a3cbff5 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py @@ -1,20 +1,23 @@ import litellm import 
instructor from pydantic import BaseModel -from typing import Type, Optional -from litellm import acompletion, JSONSchemaValidationError +from typing import Type +from litellm import JSONSchemaValidationError from cognee.shared.logging_utils import get_logger from cognee.modules.observability.get_observe import get_observe -from cognee.infrastructure.llm.exceptions import MissingSystemPromptPathError from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( LLMInterface, ) -from cognee.infrastructure.llm.LLMGateway import LLMGateway from cognee.infrastructure.llm.config import get_llm_config -from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( - rate_limit_async, - sleep_and_retry_async, + +import logging +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, ) logger = get_logger() @@ -47,8 +50,13 @@ class MistralAdapter(LLMInterface): api_key=get_llm_config().llm_api_key, ) - @sleep_and_retry_async() - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def acreate_structured_output( self, text_input: str, system_prompt: str, response_model: Type[BaseModel] ) -> BaseModel: @@ -99,31 +107,3 @@ class MistralAdapter(LLMInterface): logger.error(f"Schema validation failed: {str(e)}") logger.debug(f"Raw response: {e.raw_response}") raise ValueError(f"Response failed schema validation: {str(e)}") - - def show_prompt(self, text_input: str, system_prompt: str) -> str: - """ - Format and display the prompt for a user query. - - Parameters: - ----------- - - text_input (str): Input text from the user to be included in the prompt. - - system_prompt (str): The system prompt that will be shown alongside the user input. - - Returns: - -------- - - str: The formatted prompt string combining system prompt and user input. - """ - if not text_input: - text_input = "No user input provided." 
- if not system_prompt: - raise MissingSystemPromptPathError() - - system_prompt = LLMGateway.read_query_prompt(system_prompt) - - formatted_prompt = ( - f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" - if system_prompt - else None - ) - - return formatted_prompt diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py index 314cb79d8..9c3d185aa 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py @@ -1,4 +1,6 @@ import base64 +import litellm +import logging import instructor from typing import Type from openai import OpenAI @@ -7,11 +9,17 @@ from pydantic import BaseModel from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( LLMInterface, ) -from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( - rate_limit_async, - sleep_and_retry_async, -) from cognee.infrastructure.files.utils.open_data_file import open_data_file +from cognee.shared.logging_utils import get_logger +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, +) + +logger = get_logger() class OllamaAPIAdapter(LLMInterface): @@ -47,8 +55,13 @@ class OllamaAPIAdapter(LLMInterface): OpenAI(base_url=self.endpoint, api_key=self.api_key), mode=instructor.Mode.JSON ) - @sleep_and_retry_async() - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def acreate_structured_output( self, text_input: str, system_prompt: str, response_model: Type[BaseModel] ) -> BaseModel: @@ -90,7 +103,13 @@ class OllamaAPIAdapter(LLMInterface): return response - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def create_transcript(self, input_file: str) -> str: """ Generate an audio transcript from a user query. @@ -123,7 +142,13 @@ class OllamaAPIAdapter(LLMInterface): return transcription.text - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def transcribe_image(self, input_file: str) -> str: """ Transcribe content from an image using base64 encoding. 
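The same five-argument tenacity policy now appears on every adapter method above. As a reading aid only, here is a minimal, self-contained sketch of that configuration outside cognee; FatalConfigError, TransientProviderError and call_provider are made-up names standing in for the provider-specific non-retryable exception and the decorated adapter method:

import asyncio
import logging
import random

from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("retry_sketch")


class FatalConfigError(Exception):
    """Stand-in for a permanent failure (the role litellm.exceptions.NotFoundError plays above)."""


class TransientProviderError(Exception):
    """Stand-in for a transient failure such as a rate-limit or overload response."""


@retry(
    stop=stop_after_delay(128),                            # give up after ~128 s of total retrying
    wait=wait_exponential_jitter(2, 128),                  # exponential backoff with jitter, 2 s initial, 128 s cap
    retry=retry_if_not_exception_type(FatalConfigError),   # retry everything except permanent failures
    before_sleep=before_sleep_log(logger, logging.DEBUG),  # log each backoff at DEBUG level
    reraise=True,                                          # surface the last exception once the policy is exhausted
)
async def call_provider(prompt: str) -> str:
    # Hypothetical provider call that fails transiently about half the time.
    if random.random() < 0.5:
        raise TransientProviderError("429: slow down")
    return f"structured output for: {prompt}"


if __name__ == "__main__":
    print(asyncio.run(call_provider("ping")))

Attempts back off exponentially with jitter between 2 and 128 seconds, retrying stops once roughly 128 seconds have elapsed in total, and reraise=True means the caller sees the original exception rather than tenacity's RetryError.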
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py index 527f64d75..8877c2bdf 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py @@ -7,6 +7,15 @@ from openai import ContentFilterFinishReasonError from litellm.exceptions import ContentPolicyViolationError from instructor.core import InstructorRetryException +import logging +from tenacity import ( + retry, + stop_after_delay, + wait_exponential_jitter, + retry_if_not_exception_type, + before_sleep_log, +) + from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( LLMInterface, ) @@ -14,19 +23,13 @@ from cognee.infrastructure.llm.exceptions import ( ContentPolicyFilterError, ) from cognee.infrastructure.files.utils.open_data_file import open_data_file -from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( - rate_limit_async, - rate_limit_sync, - sleep_and_retry_async, - sleep_and_retry_sync, -) from cognee.modules.observability.get_observe import get_observe from cognee.shared.logging_utils import get_logger -observe = get_observe() - logger = get_logger() +observe = get_observe() + class OpenAIAdapter(LLMInterface): """ @@ -97,8 +100,13 @@ class OpenAIAdapter(LLMInterface): self.fallback_endpoint = fallback_endpoint @observe(as_type="generation") - @sleep_and_retry_async() - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def acreate_structured_output( self, text_input: str, system_prompt: str, response_model: Type[BaseModel] ) -> BaseModel: @@ -186,8 +194,13 @@ class OpenAIAdapter(LLMInterface): ) from error @observe - @sleep_and_retry_sync() - @rate_limit_sync + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) def create_structured_output( self, text_input: str, system_prompt: str, response_model: Type[BaseModel] ) -> BaseModel: @@ -231,7 +244,13 @@ class OpenAIAdapter(LLMInterface): max_retries=self.MAX_RETRIES, ) - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def create_transcript(self, input): """ Generate an audio transcript from a user query. @@ -263,7 +282,13 @@ class OpenAIAdapter(LLMInterface): return transcription - @rate_limit_async + @retry( + stop=stop_after_delay(128), + wait=wait_exponential_jitter(2, 128), + retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError), + before_sleep=before_sleep_log(logger, logging.DEBUG), + reraise=True, + ) async def transcribe_image(self, input) -> BaseModel: """ Generate a transcription of an image from a user query. 
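Worth noting about the patch above: tenacity's retry decorator wraps coroutine functions and plain functions alike, which is why the OpenAI adapter can put the identical decorator on the async acreate_structured_output and the synchronous create_structured_output. Since the arguments are repeated verbatim in every adapter, one possible consolidation (purely hypothetical, not part of this patch) is a small factory; llm_retry and HypotheticalNotFoundError are invented names:

import logging

from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logger = logging.getLogger("llm_retry")


class HypotheticalNotFoundError(Exception):
    """Placeholder for the non-retryable provider error (litellm's NotFoundError in the patch)."""


def llm_retry(non_retryable=HypotheticalNotFoundError):
    """Build the shared retry policy once instead of repeating the five arguments on every method."""
    return retry(
        stop=stop_after_delay(128),
        wait=wait_exponential_jitter(2, 128),
        retry=retry_if_not_exception_type(non_retryable),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )


@llm_retry()
async def acreate_structured_output(prompt: str) -> dict:
    # async call path; body omitted in this sketch
    return {"prompt": prompt}


@llm_retry()
def create_structured_output(prompt: str) -> dict:
    # synchronous call path reuses the exact same policy
    return {"prompt": prompt}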
From dede5fa6fdc5c42e6ad36826c72f8c62d91eacae Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 17:09:13 +0100 Subject: [PATCH 27/61] add unit tests for empty graph check on search --- cognee/tests/unit/api/test_search.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 cognee/tests/unit/api/test_search.py diff --git a/cognee/tests/unit/api/test_search.py b/cognee/tests/unit/api/test_search.py new file mode 100644 index 000000000..aff9e5d38 --- /dev/null +++ b/cognee/tests/unit/api/test_search.py @@ -0,0 +1,23 @@ +import pytest +import cognee +from cognee.modules.data.exceptions import SearchOnEmptyGraphError + + +@pytest.mark.asyncio +async def test_empty_search_raises_SearchOnEmptyGraphError_on_empty_graph(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await cognee.add("Sample input") + with pytest.raises(SearchOnEmptyGraphError): + await cognee.search("Sample query") + + +async def test_empty_search_doesnt_raise_SearchOnEmptyGraphError(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await cognee.add("Sample input") + await cognee.cognify() + try: + await cognee.search("Sample query") + except SearchOnEmptyGraphError: + pytest.fail("Should not raise SearchOnEmptyGraphError when data was added and cognified") From 9e38a30c4945e1d5f3596550bd32ab26463cca03 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 17:20:45 +0100 Subject: [PATCH 28/61] refactor: keep only count_nodes --- cognee/api/v1/search/search.py | 3 +-- .../databases/graph/graph_db_interface.py | 4 ---- .../infrastructure/databases/graph/kuzu/adapter.py | 8 -------- .../databases/graph/neo4j_driver/adapter.py | 8 -------- cognee/tests/test_kuzu.py | 14 ++++---------- cognee/tests/test_neo4j.py | 13 ++++--------- examples/python/dynamic_steps_example.py | 2 +- 7 files changed, 10 insertions(+), 42 deletions(-) diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 32035e612..880a57b99 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -177,10 +177,9 @@ async def search( raise DatasetNotFoundError(message="No datasets found.") graph_engine = await get_graph_engine() - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - if nodes_count == 0 or edges_count == 0: + if nodes_count == 0: raise SearchOnEmptyGraphError( message="Knowledge graph is empty, please ensure data is added and cognified." 
) diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index abfdff784..a4542cefe 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -163,10 +163,6 @@ class GraphDBInterface(ABC): async def count_nodes(self) -> int: raise NotImplementedError - @abstractmethod - async def count_edges(self) -> int: - raise NotImplementedError - @abstractmethod async def query(self, query: str, params: dict) -> List[Any]: """ diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index a31726c9a..04c163efa 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -185,14 +185,6 @@ class KuzuAdapter(GraphDBInterface): except FileNotFoundError: logger.warning(f"Kuzu S3 storage file not found: {self.db_path}") - async def count_edges(self) -> int: - query = """ - MATCH ()-[r]->() - RETURN COUNT(r); - """ - query_result = await self.query(query) - return query_result[0][0] - async def count_nodes(self) -> int: query = """ MATCH (n) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index a61ab6f0b..ac19069f4 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -87,14 +87,6 @@ class Neo4jAdapter(GraphDBInterface): async with self.driver.session(database=self.graph_database_name) as session: yield session - async def count_edges(self) -> int: - query = """ - MATCH ()-[r]->() - RETURN COUNT(r) as total_edges; - """ - query_result = await self.query(query) - return query_result[0]["total_edges"] - async def count_nodes(self) -> int: query = """ MATCH (n) diff --git a/cognee/tests/test_kuzu.py b/cognee/tests/test_kuzu.py index e39edd06a..c07a51104 100644 --- a/cognee/tests/test_kuzu.py +++ b/cognee/tests/test_kuzu.py @@ -51,26 +51,21 @@ async def main(): graph_engine = await get_graph_engine() - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert edges_count == 0 and nodes_count == 0, "Kuzu graph database is not empty" + assert nodes_count == 0, "Kuzu graph database is not empty" await cognee.add([explanation_file_path_quantum], dataset_name) - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert edges_count == 0 and nodes_count == 0, ( - "Kuzu graph database should be empty before cognify" - ) + assert nodes_count == 0, "Kuzu graph database should be empty before cognify" await cognee.cognify([dataset_name]) - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert edges_count != 0 and nodes_count != 0, "Kuzu graph database should not be empty" + assert nodes_count != 0, "Kuzu graph database should not be empty" from cognee.infrastructure.databases.vector import get_vector_engine @@ -136,10 +131,9 @@ async def main(): await cognee.prune.prune_system(metadata=True) - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert edges_count == 0 and nodes_count == 0, "Kuzu graph database is not empty" + assert nodes_count == 0, "Kuzu graph database is not empty" finally: # Ensure cleanup even if tests fail diff --git a/cognee/tests/test_neo4j.py 
b/cognee/tests/test_neo4j.py index 11f6156bd..6f1fcf975 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -39,10 +39,9 @@ async def main(): graph_engine = await get_graph_engine() - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert edges_count == 0 and nodes_count == 0, "Graph has to be empty" + assert nodes_count == 0, "Graph has to be empty" await cognee.add([explanation_file_path_nlp], dataset_name) @@ -51,18 +50,15 @@ async def main(): ) await cognee.add([explanation_file_path_quantum], dataset_name) - - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert edges_count == 0 and nodes_count == 0, "Graph has to be empty before cognify" + assert nodes_count == 0, "Graph has to be empty before cognify" await cognee.cognify([dataset_name]) - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert edges_count != 0 and nodes_count != 0, "Graph shouldn't be empty" + assert nodes_count != 0, "Graph shouldn't be empty" from cognee.infrastructure.databases.vector import get_vector_engine @@ -136,9 +132,8 @@ async def main(): assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) - edges_count = await graph_engine.count_edges() nodes_count = await graph_engine.count_nodes() - assert nodes_count == 0 and edges_count == 0, "Neo4j graph database is not empty" + assert nodes_count == 0, "Neo4j graph database is not empty" if __name__ == "__main__": diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index bce2ea8be..5ff68cecc 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -199,7 +199,7 @@ if __name__ == "__main__": "prune_data": rebuild_kg, "prune_system": rebuild_kg, "add_text": rebuild_kg, - "cognify": rebuild_kg, + "cognify": False, "graph_metrics": rebuild_kg, "retriever": retrieve, } From a854e4f42689d7c7fb567c6e4b62443fbb818b19 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 17:22:51 +0100 Subject: [PATCH 29/61] chore: update GraphDBInterface to not throw NotImplementedError for count_nodes() --- cognee/infrastructure/databases/graph/graph_db_interface.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index a4542cefe..d7542eac6 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -161,7 +161,8 @@ class GraphDBInterface(ABC): @abstractmethod async def count_nodes(self) -> int: - raise NotImplementedError + logger.warning("count_nodes is not implemented") + return 1 # dummy value to not fail search() @abstractmethod async def query(self, query: str, params: dict) -> List[Any]: From c9a3f483987ea78a8ba1f2b199541ff362548638 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 18:26:01 +0200 Subject: [PATCH 30/61] fix: Resolve issue with data element incremental loading for multiple datasets --- cognee/modules/pipelines/operations/run_tasks_data_item.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 94fc631a8..152e72d7f 100644 --- 
a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -115,9 +115,8 @@ async def run_tasks_data_item_incremental( data_point = ( await session.execute(select(Data).filter(Data.id == data_id)) ).scalar_one_or_none() - data_point.pipeline_status[pipeline_name] = { - str(dataset.id): DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED - } + status_for_pipeline = data_point.pipeline_status.setdefault(pipeline_name, {}) + status_for_pipeline[str(dataset.id)] = DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED await session.merge(data_point) await session.commit() From 38406a0ab1b3d4d47f8d9fb4e95b4612cd3ce117 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 17:32:48 +0100 Subject: [PATCH 31/61] chore: remove memgraph from cognee repo --- .../databases/graph/get_graph_engine.py | 2 +- .../graph/memgraph/memgraph_adapter.py | 1116 ----------------- cognee/tests/test_memgraph.py | 105 -- notebooks/neptune-analytics-example.ipynb | 82 +- 4 files changed, 42 insertions(+), 1263 deletions(-) delete mode 100644 cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py delete mode 100644 cognee/tests/test_memgraph.py diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 1861aa15c..1ea61d29f 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -162,5 +162,5 @@ def create_graph_engine( raise EnvironmentError( f"Unsupported graph database provider: {graph_database_provider}. " - f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['neo4j', 'kuzu', 'kuzu-remote', 'memgraph', 'neptune', 'neptune_analytics'])}" + f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['neo4j', 'kuzu', 'kuzu-remote', 'neptune', 'neptune_analytics'])}" ) diff --git a/cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py b/cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py deleted file mode 100644 index 3612e3277..000000000 --- a/cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +++ /dev/null @@ -1,1116 +0,0 @@ -"""Memgraph Adapter for Graph Database""" - -import json -from cognee.shared.logging_utils import get_logger, ERROR -import asyncio -from textwrap import dedent -from typing import Optional, Any, List, Dict, Type, Tuple -from contextlib import asynccontextmanager -from uuid import UUID -from neo4j import AsyncSession -from neo4j import AsyncGraphDatabase -from neo4j.exceptions import Neo4jError -from cognee.infrastructure.engine import DataPoint -from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface -from cognee.modules.storage.utils import JSONEncoder -from cognee.infrastructure.databases.exceptions.exceptions import NodesetFilterNotSupportedError - -logger = get_logger("MemgraphAdapter", level=ERROR) - - -class MemgraphAdapter(GraphDBInterface): - """ - Handles interaction with a Memgraph database through various graph operations. 
- - Public methods include: - - get_session - - query - - has_node - - add_node - - add_nodes - - extract_node - - extract_nodes - - delete_node - - delete_nodes - - has_edge - - has_edges - - add_edge - - add_edges - - get_edges - - get_disconnected_nodes - - get_predecessors - - get_successors - - get_neighbours - - get_connections - - remove_connection_to_predecessors_of - - remove_connection_to_successors_of - - delete_graph - - serialize_properties - - get_model_independent_graph_data - - get_graph_data - - get_nodeset_subgraph - - get_filtered_graph_data - - get_node_labels_string - - get_relationship_labels_string - - get_graph_metrics - """ - - def __init__( - self, - graph_database_url: str, - graph_database_username: Optional[str] = None, - graph_database_password: Optional[str] = None, - driver: Optional[Any] = None, - ): - # Only use auth if both username and password are provided - auth = None - if graph_database_username and graph_database_password: - auth = (graph_database_username, graph_database_password) - - self.driver = driver or AsyncGraphDatabase.driver( - graph_database_url, - auth=auth, - max_connection_lifetime=120, - ) - - @asynccontextmanager - async def get_session(self) -> AsyncSession: - """ - Manage a session with the database, yielding the session for use in operations. - """ - async with self.driver.session() as session: - yield session - - async def query( - self, - query: str, - params: Optional[Dict[str, Any]] = None, - ) -> List[Dict[str, Any]]: - """ - Execute a provided query on the Memgraph database and return the results. - - Parameters: - ----------- - - - query (str): The Cypher query to be executed against the database. - - params (Optional[Dict[str, Any]]): Optional parameters to be used in the query. - (default None) - - Returns: - -------- - - - List[Dict[str, Any]]: A list of dictionaries representing the result set of the - query. - """ - try: - async with self.get_session() as session: - result = await session.run(query, params) - data = await result.data() - return data - except Neo4jError as error: - logger.error("Memgraph query error: %s", error, exc_info=True) - raise error - - async def has_node(self, node_id: str) -> bool: - """ - Determine if a node with the given ID exists in the database. - - Parameters: - ----------- - - - node_id (str): The ID of the node to check for existence. - - Returns: - -------- - - - bool: True if the node exists; otherwise, False. - """ - results = await self.query( - """ - MATCH (n) - WHERE n.id = $node_id - RETURN COUNT(n) > 0 AS node_exists - """, - {"node_id": node_id}, - ) - return results[0]["node_exists"] if len(results) > 0 else False - - async def add_node(self, node: DataPoint): - """ - Add a new node to the database with specified properties. - - Parameters: - ----------- - - - node (DataPoint): The DataPoint object representing the node to add. - - Returns: - -------- - - The result of the node addition, including its internal ID and node ID. 
- """ - serialized_properties = self.serialize_properties(node.model_dump()) - - query = """ - MERGE (node {id: $node_id}) - ON CREATE SET node:$node_label, node += $properties, node.updated_at = timestamp() - ON MATCH SET node:$node_label, node += $properties, node.updated_at = timestamp() - RETURN ID(node) AS internal_id, node.id AS nodeId - """ - - params = { - "node_id": str(node.id), - "node_label": type(node).__name__, - "properties": serialized_properties, - } - return await self.query(query, params) - - async def add_nodes(self, nodes: list[DataPoint]) -> None: - """ - Add multiple nodes to the database in a single operation. - - Parameters: - ----------- - - - nodes (list[DataPoint]): A list of DataPoint objects representing the nodes to - add. - - Returns: - -------- - - - None: None. - """ - query = """ - UNWIND $nodes AS node - MERGE (n {id: node.node_id}) - ON CREATE SET n:node.label, n += node.properties, n.updated_at = timestamp() - ON MATCH SET n:node.label, n += node.properties, n.updated_at = timestamp() - RETURN ID(n) AS internal_id, n.id AS nodeId - """ - - nodes = [ - { - "node_id": str(node.id), - "label": type(node).__name__, - "properties": self.serialize_properties(node.model_dump()), - } - for node in nodes - ] - - results = await self.query(query, dict(nodes=nodes)) - return results - - async def extract_node(self, node_id: str): - """ - Retrieve a single node based on its ID. - - Parameters: - ----------- - - - node_id (str): The ID of the node to retrieve. - - Returns: - -------- - - The node corresponding to the provided ID, or None if not found. - """ - results = await self.extract_nodes([node_id]) - - return results[0] if len(results) > 0 else None - - async def extract_nodes(self, node_ids: List[str]): - """ - Retrieve multiple nodes based on their IDs. - - Parameters: - ----------- - - - node_ids (List[str]): A list of IDs for the nodes to retrieve. - - Returns: - -------- - - A list of nodes corresponding to the provided IDs. - """ - query = """ - UNWIND $node_ids AS id - MATCH (node {id: id}) - RETURN node""" - - params = {"node_ids": node_ids} - - results = await self.query(query, params) - - return [result["node"] for result in results] - - async def delete_node(self, node_id: str): - """ - Delete a node from the database based on its ID. - - Parameters: - ----------- - - - node_id (str): The ID of the node to delete. - - Returns: - -------- - - None. - """ - sanitized_id = node_id.replace(":", "_") - - query = "MATCH (node: {{id: $node_id}}) DETACH DELETE node" - params = {"node_id": sanitized_id} - - return await self.query(query, params) - - async def delete_nodes(self, node_ids: list[str]) -> None: - """ - Delete multiple nodes from the database based on their IDs. - - Parameters: - ----------- - - - node_ids (list[str]): A list of IDs for the nodes to delete. - - Returns: - -------- - - - None: None. - """ - query = """ - UNWIND $node_ids AS id - MATCH (node {id: id}) - DETACH DELETE node""" - - params = {"node_ids": node_ids} - - return await self.query(query, params) - - async def has_edge(self, from_node: UUID, to_node: UUID, edge_label: str) -> bool: - """ - Check if a directed edge exists between two nodes identified by their IDs. - - Parameters: - ----------- - - - from_node (UUID): The ID of the source node. - - to_node (UUID): The ID of the target node. - - edge_label (str): The label of the edge to check. - - Returns: - -------- - - - bool: True if the edge exists; otherwise, False. 
- """ - query = """ - MATCH (from_node)-[relationship]->(to_node) - WHERE from_node.id = $from_node_id AND to_node.id = $to_node_id AND type(relationship) = $edge_label - RETURN COUNT(relationship) > 0 AS edge_exists - """ - - params = { - "from_node_id": str(from_node), - "to_node_id": str(to_node), - "edge_label": edge_label, - } - - records = await self.query(query, params) - return records[0]["edge_exists"] if records else False - - async def has_edges(self, edges): - """ - Check for the existence of multiple edges based on provided criteria. - - Parameters: - ----------- - - - edges: A list of edges to verify existence for. - - Returns: - -------- - - A list of boolean values indicating the existence of each edge. - """ - query = """ - UNWIND $edges AS edge - MATCH (a)-[r]->(b) - WHERE id(a) = edge.from_node AND id(b) = edge.to_node AND type(r) = edge.relationship_name - RETURN edge.from_node AS from_node, edge.to_node AS to_node, edge.relationship_name AS relationship_name, count(r) > 0 AS edge_exists - """ - - try: - params = { - "edges": [ - { - "from_node": str(edge[0]), - "to_node": str(edge[1]), - "relationship_name": edge[2], - } - for edge in edges - ], - } - - results = await self.query(query, params) - return [result["edge_exists"] for result in results] - except Neo4jError as error: - logger.error("Memgraph query error: %s", error, exc_info=True) - raise error - - async def add_edge( - self, - from_node: UUID, - to_node: UUID, - relationship_name: str, - edge_properties: Optional[Dict[str, Any]] = None, - ): - """ - Add a directed edge between two nodes with optional properties. - - Parameters: - ----------- - - - from_node (UUID): The ID of the source node. - - to_node (UUID): The ID of the target node. - - relationship_name (str): The type/label of the relationship to create. - - edge_properties (Optional[Dict[str, Any]]): Optional properties associated with - the edge. (default None) - - Returns: - -------- - - The result of the edge addition operation, including relationship details. - """ - - exists = await asyncio.gather(self.has_node(str(from_node)), self.has_node(str(to_node))) - - if not all(exists): - return None - - serialized_properties = self.serialize_properties(edge_properties or {}) - - query = dedent( - f"""\ - MATCH (from_node {{id: $from_node}}), - (to_node {{id: $to_node}}) - WHERE from_node IS NOT NULL AND to_node IS NOT NULL - MERGE (from_node)-[r:{relationship_name}]->(to_node) - ON CREATE SET r += $properties, r.updated_at = timestamp() - ON MATCH SET r += $properties, r.updated_at = timestamp() - RETURN r - """ - ) - - params = { - "from_node": str(from_node), - "to_node": str(to_node), - "relationship_name": relationship_name, - "properties": serialized_properties, - } - - return await self.query(query, params) - - async def add_edges(self, edges: list[tuple[str, str, str, dict[str, Any]]]) -> None: - """ - Batch add multiple edges between nodes, enforcing specified relationships. - - Parameters: - ----------- - - - edges (list[tuple[str, str, str, dict[str, Any]]): A list of tuples containing - specifications for each edge to add. - - Returns: - -------- - - - None: None. 
- """ - query = """ - UNWIND $edges AS edge - MATCH (from_node {id: edge.from_node}) - MATCH (to_node {id: edge.to_node}) - CALL merge.relationship( - from_node, - edge.relationship_name, - { - source_node_id: edge.from_node, - target_node_id: edge.to_node - }, - edge.properties, - to_node, - {} - ) YIELD rel - RETURN rel""" - - edges = [ - { - "from_node": str(edge[0]), - "to_node": str(edge[1]), - "relationship_name": edge[2], - "properties": { - **(edge[3] if edge[3] else {}), - "source_node_id": str(edge[0]), - "target_node_id": str(edge[1]), - }, - } - for edge in edges - ] - - try: - results = await self.query(query, dict(edges=edges)) - return results - except Neo4jError as error: - logger.error("Memgraph query error: %s", error, exc_info=True) - raise error - - async def get_edges(self, node_id: str): - """ - Retrieve all edges connected to a specific node identified by its ID. - - Parameters: - ----------- - - - node_id (str): The ID of the node for which to retrieve connected edges. - - Returns: - -------- - - A list of tuples representing the edges connected to the node. - """ - query = """ - MATCH (n {id: $node_id})-[r]-(m) - RETURN n, r, m - """ - - results = await self.query(query, dict(node_id=node_id)) - - return [ - (result["n"]["id"], result["m"]["id"], {"relationship_name": result["r"][1]}) - for result in results - ] - - async def get_disconnected_nodes(self) -> list[str]: - """ - Identify nodes in the graph that do not belong to the largest connected component. - - Returns: - -------- - - - list[str]: A list of IDs representing the disconnected nodes. - """ - query = """ - // Step 1: Collect all nodes - MATCH (n) - WITH COLLECT(n) AS nodes - - // Step 2: Find all connected components - WITH nodes - CALL { - WITH nodes - UNWIND nodes AS startNode - MATCH path = (startNode)-[*]-(connectedNode) - WITH startNode, COLLECT(DISTINCT connectedNode) AS component - RETURN component - } - - // Step 3: Aggregate components - WITH COLLECT(component) AS components - - // Step 4: Identify the largest connected component - UNWIND components AS component - WITH component - ORDER BY SIZE(component) DESC - LIMIT 1 - WITH component AS largestComponent - - // Step 5: Find nodes not in the largest connected component - MATCH (n) - WHERE NOT n IN largestComponent - RETURN COLLECT(ID(n)) AS ids - """ - - results = await self.query(query) - return results[0]["ids"] if len(results) > 0 else [] - - async def get_predecessors(self, node_id: str, edge_label: str = None) -> list[str]: - """ - Retrieve all predecessors of a node based on its ID and optional edge label. - - Parameters: - ----------- - - - node_id (str): The ID of the node to find predecessors for. - - edge_label (str): Optional edge label to filter predecessors. (default None) - - Returns: - -------- - - - list[str]: A list of predecessor node IDs. 
- """ - if edge_label is not None: - query = """ - MATCH (node)<-[r]-(predecessor) - WHERE node.id = $node_id AND type(r) = $edge_label - RETURN predecessor - """ - - results = await self.query( - query, - dict( - node_id=node_id, - edge_label=edge_label, - ), - ) - - return [result["predecessor"] for result in results] - else: - query = """ - MATCH (node)<-[r]-(predecessor) - WHERE node.id = $node_id - RETURN predecessor - """ - - results = await self.query( - query, - dict( - node_id=node_id, - ), - ) - - return [result["predecessor"] for result in results] - - async def get_successors(self, node_id: str, edge_label: str = None) -> list[str]: - """ - Retrieve all successors of a node based on its ID and optional edge label. - - Parameters: - ----------- - - - node_id (str): The ID of the node to find successors for. - - edge_label (str): Optional edge label to filter successors. (default None) - - Returns: - -------- - - - list[str]: A list of successor node IDs. - """ - if edge_label is not None: - query = """ - MATCH (node)-[r]->(successor) - WHERE node.id = $node_id AND type(r) = $edge_label - RETURN successor - """ - - results = await self.query( - query, - dict( - node_id=node_id, - edge_label=edge_label, - ), - ) - - return [result["successor"] for result in results] - else: - query = """ - MATCH (node)-[r]->(successor) - WHERE node.id = $node_id - RETURN successor - """ - - results = await self.query( - query, - dict( - node_id=node_id, - ), - ) - - return [result["successor"] for result in results] - - async def get_neighbors(self, node_id: str) -> List[Dict[str, Any]]: - """ - Get both predecessors and successors of a node. - - Parameters: - ----------- - - - node_id (str): The ID of the node to find neighbors for. - - Returns: - -------- - - - List[Dict[str, Any]]: A combined list of neighbor node IDs. - """ - predecessors, successors = await asyncio.gather( - self.get_predecessors(node_id), self.get_successors(node_id) - ) - - return predecessors + successors - - async def get_node(self, node_id: str) -> Optional[Dict[str, Any]]: - """Get a single node by ID.""" - query = """ - MATCH (node {id: $node_id}) - RETURN node - """ - results = await self.query(query, {"node_id": node_id}) - return results[0]["node"] if results else None - - async def get_nodes(self, node_ids: List[str]) -> List[Dict[str, Any]]: - """Get multiple nodes by their IDs.""" - query = """ - UNWIND $node_ids AS id - MATCH (node {id: id}) - RETURN node - """ - results = await self.query(query, {"node_ids": node_ids}) - return [result["node"] for result in results] - - async def get_connections(self, node_id: UUID) -> list: - """ - Retrieve connections for a given node, including both predecessors and successors. - - Parameters: - ----------- - - - node_id (UUID): The ID of the node for which to retrieve connections. - - Returns: - -------- - - - list: A list of connections associated with the node. 
- """ - predecessors_query = """ - MATCH (node)<-[relation]-(neighbour) - WHERE node.id = $node_id - RETURN neighbour, relation, node - """ - successors_query = """ - MATCH (node)-[relation]->(neighbour) - WHERE node.id = $node_id - RETURN node, relation, neighbour - """ - - predecessors, successors = await asyncio.gather( - self.query(predecessors_query, dict(node_id=str(node_id))), - self.query(successors_query, dict(node_id=str(node_id))), - ) - - connections = [] - - for neighbour in predecessors: - neighbour = neighbour["relation"] - connections.append((neighbour[0], {"relationship_name": neighbour[1]}, neighbour[2])) - - for neighbour in successors: - neighbour = neighbour["relation"] - connections.append((neighbour[0], {"relationship_name": neighbour[1]}, neighbour[2])) - - return connections - - async def remove_connection_to_predecessors_of( - self, node_ids: list[str], edge_label: str - ) -> None: - """ - Remove specified connections to the predecessors of the given node IDs. - - Parameters: - ----------- - - - node_ids (list[str]): A list of node IDs from which to remove predecessor - connections. - - edge_label (str): The label of the edges to remove. - - Returns: - -------- - - - None: None. - """ - query = f""" - UNWIND $node_ids AS nid - MATCH (node {id: nid})-[r]->(predecessor) - WHERE type(r) = $edge_label - DELETE r; - """ - - params = {"node_ids": node_ids, "edge_label": edge_label} - - return await self.query(query, params) - - async def remove_connection_to_successors_of( - self, node_ids: list[str], edge_label: str - ) -> None: - """ - Remove specified connections to the successors of the given node IDs. - - Parameters: - ----------- - - - node_ids (list[str]): A list of node IDs from which to remove successor - connections. - - edge_label (str): The label of the edges to remove. - - Returns: - -------- - - - None: None. - """ - query = f""" - UNWIND $node_ids AS id - MATCH (node:`{id}`)<-[r:{edge_label}]-(successor) - DELETE r; - """ - - params = {"node_ids": node_ids} - - return await self.query(query, params) - - async def delete_graph(self): - """ - Completely delete the graph from the database, removing all nodes and edges. - - Returns: - -------- - - None. - """ - query = """MATCH (node) - DETACH DELETE node;""" - - return await self.query(query) - - def serialize_properties(self, properties=dict()): - """ - Convert property values to a suitable representation for storage. - - Parameters: - ----------- - - - properties: A dictionary of properties to serialize. (default dict()) - - Returns: - -------- - - A dictionary of serialized properties. - """ - serialized_properties = {} - - for property_key, property_value in properties.items(): - if isinstance(property_value, UUID): - serialized_properties[property_key] = str(property_value) - continue - - if isinstance(property_value, dict): - serialized_properties[property_key] = json.dumps(property_value, cls=JSONEncoder) - continue - - serialized_properties[property_key] = property_value - - return serialized_properties - - async def get_model_independent_graph_data(self): - """ - Fetch nodes and relationships without any specific model filtering. - - Returns: - -------- - - A tuple containing nodes and edges as collections. 
- """ - query_nodes = "MATCH (n) RETURN collect(n) AS nodes" - nodes = await self.query(query_nodes) - - query_edges = "MATCH (n)-[r]->(m) RETURN collect([n, r, m]) AS elements" - edges = await self.query(query_edges) - - return (nodes, edges) - - async def get_graph_data(self): - """ - Retrieve all nodes and edges from the graph, including their properties. - - Returns: - -------- - - A tuple containing lists of nodes and edges. - """ - query = "MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties" - - result = await self.query(query) - - nodes = [ - ( - record["id"], - record["properties"], - ) - for record in result - ] - - query = """ - MATCH (n)-[r]->(m) - RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties - """ - result = await self.query(query) - edges = [ - ( - record["source"], - record["target"], - record["type"], - record["properties"], - ) - for record in result - ] - - return (nodes, edges) - - async def get_nodeset_subgraph( - self, node_type: Type[Any], node_name: List[str] - ) -> Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]]: - """ - Throw an error indicating that node set filtering is not supported. - - Parameters: - ----------- - - - node_type (Type[Any]): The type of nodes to filter. - - node_name (List[str]): A list of node names to filter. - """ - raise NodesetFilterNotSupportedError - - async def get_filtered_graph_data(self, attribute_filters): - """ - Fetch nodes and relationships based on specified attribute filters. - - Parameters: - ----------- - - - attribute_filters: A list of criteria to filter nodes and relationships. - - Returns: - -------- - - A tuple containing filtered nodes and edges. - """ - where_clauses = [] - for attribute, values in attribute_filters[0].items(): - values_str = ", ".join( - f"'{value}'" if isinstance(value, str) else str(value) for value in values - ) - where_clauses.append(f"n.{attribute} IN [{values_str}]") - - where_clause = " AND ".join(where_clauses) - - query_nodes = f""" - MATCH (n) - WHERE {where_clause} - RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties - """ - result_nodes = await self.query(query_nodes) - - nodes = [ - ( - record["id"], - record["properties"], - ) - for record in result_nodes - ] - - query_edges = f""" - MATCH (n)-[r]->(m) - WHERE {where_clause} AND {where_clause.replace("n.", "m.")} - RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties - """ - result_edges = await self.query(query_edges) - - edges = [ - ( - record["source"], - record["target"], - record["type"], - record["properties"], - ) - for record in result_edges - ] - - return (nodes, edges) - - async def get_node_labels_string(self): - """ - Retrieve a string representation of all unique node labels in the graph. - - Returns: - -------- - - A string containing unique node labels. - """ - node_labels_query = """ - MATCH (n) - WITH DISTINCT labels(n) AS labelList - UNWIND labelList AS label - RETURN collect(DISTINCT label) AS labels; - """ - node_labels_result = await self.query(node_labels_query) - node_labels = node_labels_result[0]["labels"] if node_labels_result else [] - - if not node_labels: - raise ValueError("No node labels found in the database") - - node_labels_str = "[" + ", ".join(f"'{label}'" for label in node_labels) + "]" - return node_labels_str - - async def get_relationship_labels_string(self): - """ - Retrieve a string representation of all unique relationship types in the graph. 
- - Returns: - -------- - - A string containing unique relationship types. - """ - relationship_types_query = ( - "MATCH ()-[r]->() RETURN collect(DISTINCT type(r)) AS relationships;" - ) - relationship_types_result = await self.query(relationship_types_query) - relationship_types = ( - relationship_types_result[0]["relationships"] if relationship_types_result else [] - ) - - if not relationship_types: - raise ValueError("No relationship types found in the database.") - - relationship_types_undirected_str = ( - "{" - + ", ".join(f"{rel}" + ": {orientation: 'UNDIRECTED'}" for rel in relationship_types) - + "}" - ) - return relationship_types_undirected_str - - async def get_graph_metrics(self, include_optional=False): - """ - Calculate and return various metrics of the graph, including mandatory and optional - metrics. - - Parameters: - ----------- - - - include_optional: Specify whether to include optional metrics in the results. - (default False) - - Returns: - -------- - - A dictionary containing calculated graph metrics. - """ - - try: - # Basic metrics - node_count = await self.query("MATCH (n) RETURN count(n)") - edge_count = await self.query("MATCH ()-[r]->() RETURN count(r)") - num_nodes = node_count[0][0] if node_count else 0 - num_edges = edge_count[0][0] if edge_count else 0 - - # Calculate mandatory metrics - mandatory_metrics = { - "num_nodes": num_nodes, - "num_edges": num_edges, - "mean_degree": (2 * num_edges) / num_nodes if num_nodes > 0 else 0, - "edge_density": (num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0, - } - - # Calculate connected components - components_query = """ - MATCH (n:Node) - WITH n.id AS node_id - MATCH path = (n)-[:EDGE*0..]-() - WITH COLLECT(DISTINCT node_id) AS component - RETURN COLLECT(component) AS components - """ - components_result = await self.query(components_query) - component_sizes = ( - [len(comp) for comp in components_result[0][0]] if components_result else [] - ) - - mandatory_metrics.update( - { - "num_connected_components": len(component_sizes), - "sizes_of_connected_components": component_sizes, - } - ) - - if include_optional: - # Self-loops - self_loops_query = """ - MATCH (n:Node)-[r:EDGE]->(n) - RETURN COUNT(r) - """ - self_loops = await self.query(self_loops_query) - num_selfloops = self_loops[0][0] if self_loops else 0 - - # Shortest paths (simplified for Kuzu) - paths_query = """ - MATCH (n:Node), (m:Node) - WHERE n.id < m.id - MATCH path = (n)-[:EDGE*]-(m) - RETURN MIN(LENGTH(path)) AS length - """ - paths = await self.query(paths_query) - path_lengths = [p[0] for p in paths if p[0] is not None] - - # Local clustering coefficient - clustering_query = """ - /// Step 1: Get each node with its neighbors and degree - MATCH (n:Node)-[:EDGE]-(neighbor) - WITH n, COLLECT(DISTINCT neighbor) AS neighbors, COUNT(DISTINCT neighbor) AS degree - - // Step 2: Pair up neighbors and check if they are connected - UNWIND neighbors AS n1 - UNWIND neighbors AS n2 - WITH n, degree, n1, n2 - WHERE id(n1) < id(n2) // avoid duplicate pairs - - // Step 3: Use OPTIONAL MATCH to see if n1 and n2 are connected - OPTIONAL MATCH (n1)-[:EDGE]-(n2) - WITH n, degree, COUNT(n2) AS triangle_count - - // Step 4: Compute local clustering coefficient - WITH n, degree, - CASE WHEN degree <= 1 THEN 0.0 - ELSE (1.0 * triangle_count) / (degree * (degree - 1) / 2.0) - END AS local_cc - - // Step 5: Compute average - RETURN AVG(local_cc) AS avg_clustering_coefficient - """ - clustering = await self.query(clustering_query) - - optional_metrics = { - 
"num_selfloops": num_selfloops, - "diameter": max(path_lengths) if path_lengths else -1, - "avg_shortest_path_length": sum(path_lengths) / len(path_lengths) - if path_lengths - else -1, - "avg_clustering": clustering[0][0] if clustering and clustering[0][0] else -1, - } - else: - optional_metrics = { - "num_selfloops": -1, - "diameter": -1, - "avg_shortest_path_length": -1, - "avg_clustering": -1, - } - - return {**mandatory_metrics, **optional_metrics} - - except Exception as e: - logger.error(f"Failed to get graph metrics: {e}") - return { - "num_nodes": 0, - "num_edges": 0, - "mean_degree": 0, - "edge_density": 0, - "num_connected_components": 0, - "sizes_of_connected_components": [], - "num_selfloops": -1, - "diameter": -1, - "avg_shortest_path_length": -1, - "avg_clustering": -1, - } diff --git a/cognee/tests/test_memgraph.py b/cognee/tests/test_memgraph.py deleted file mode 100644 index d0d968fc4..000000000 --- a/cognee/tests/test_memgraph.py +++ /dev/null @@ -1,105 +0,0 @@ -import os - -import pathlib -import cognee -from cognee.infrastructure.files.storage import get_storage_config -from cognee.modules.search.operations import get_history -from cognee.modules.users.methods import get_default_user -from cognee.shared.logging_utils import get_logger -from cognee.modules.search.types import SearchType - - -logger = get_logger() - - -async def main(): - cognee.config.set_graph_database_provider("memgraph") - data_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_memgraph") - ).resolve() - ) - cognee.config.data_root_directory(data_directory_path) - cognee_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_memgraph") - ).resolve() - ) - cognee.config.system_root_directory(cognee_directory_path) - - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - dataset_name = "cs_explanations" - - explanation_file_path_nlp = os.path.join( - pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" - ) - await cognee.add([explanation_file_path_nlp], dataset_name) - - explanation_file_path_quantum = os.path.join( - pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt" - ) - - await cognee.add([explanation_file_path_quantum], dataset_name) - - await cognee.cognify([dataset_name]) - - from cognee.infrastructure.databases.vector import get_vector_engine - - vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] - random_node_name = random_node.payload["text"] - - search_results = await cognee.search( - query_type=SearchType.GRAPH_COMPLETION, query_text=random_node_name - ) - assert len(search_results) != 0, "The search results list is empty." - print("\n\nExtracted sentences are:\n") - for result in search_results: - print(f"{result}\n") - - search_results = await cognee.search(query_type=SearchType.CHUNKS, query_text=random_node_name) - assert len(search_results) != 0, "The search results list is empty." - print("\n\nExtracted chunks are:\n") - for result in search_results: - print(f"{result}\n") - - search_results = await cognee.search( - query_type=SearchType.SUMMARIES, query_text=random_node_name - ) - assert len(search_results) != 0, "Query related summaries don't exist." 
- print("\nExtracted results are:\n") - for result in search_results: - print(f"{result}\n") - - search_results = await cognee.search( - query_type=SearchType.NATURAL_LANGUAGE, - query_text=f"Find nodes connected to node with name {random_node_name}", - ) - assert len(search_results) != 0, "Query related natural language don't exist." - print("\nExtracted results are:\n") - for result in search_results: - print(f"{result}\n") - - user = await get_default_user() - history = await get_history(user.id) - - assert len(history) == 8, "Search history is not correct." - - await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] - assert not os.path.isdir(data_root_directory), "Local data files are not deleted" - - await cognee.prune.prune_system(metadata=True) - from cognee.infrastructure.databases.graph import get_graph_engine - - graph_engine = await get_graph_engine() - nodes, edges = await graph_engine.get_graph_data() - assert len(nodes) == 0 and len(edges) == 0, "Memgraph graph database is not empty" - - -if __name__ == "__main__": - import asyncio - - asyncio.run(main()) diff --git a/notebooks/neptune-analytics-example.ipynb b/notebooks/neptune-analytics-example.ipynb index e80ea4dcb..c85ccf58a 100644 --- a/notebooks/neptune-analytics-example.ipynb +++ b/notebooks/neptune-analytics-example.ipynb @@ -83,16 +83,16 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "import pathlib\n", "from cognee import config, add, cognify, search, SearchType, prune, visualize_graph\n", "from dotenv import load_dotenv" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -106,7 +106,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# load environment variables from file .env\n", "load_dotenv()\n", @@ -145,9 +147,7 @@ " \"vector_db_url\": f\"neptune-graph://{graph_identifier}\", # Neptune Analytics endpoint with the format neptune-graph://\n", " }\n", ")" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -159,19 +159,19 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Prune data and system metadata before running, only if we want \"fresh\" state.\n", "await prune.prune_data()\n", "await prune.prune_system(metadata=True)" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## Setup data and cognify\n", "\n", @@ -180,7 +180,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Add sample text to the dataset\n", "sample_text_1 = \"\"\"Neptune Analytics is a memory-optimized graph database engine for analytics. 
With Neptune\n", @@ -205,9 +207,7 @@ "\n", "# Cognify the text data.\n", "await cognify([dataset_name])" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -215,14 +215,16 @@ "source": [ "## Graph Memory visualization\n", "\n", - "Initialize Memgraph as a Graph Memory store and save to .artefacts/graph_visualization.html\n", + "Initialize Neptune as a Graph Memory store and save to .artefacts/graph_visualization.html\n", "\n", "![visualization](./neptune_analytics_demo.png)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Get a graphistry url (Register for a free account at https://www.graphistry.com)\n", "# url = await render_graph()\n", @@ -235,9 +237,7 @@ " ).resolve()\n", ")\n", "await visualize_graph(graph_file_path)" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -250,19 +250,19 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Completion query that uses graph data to form context.\n", "graph_completion = await search(query_text=\"What is Neptune Analytics?\", query_type=SearchType.GRAPH_COMPLETION)\n", "print(\"\\nGraph completion result is:\")\n", "print(graph_completion)" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## SEARCH: RAG Completion\n", "\n", @@ -271,19 +271,19 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# Completion query that uses document chunks to form context.\n", "rag_completion = await search(query_text=\"What is Neptune Analytics?\", query_type=SearchType.RAG_COMPLETION)\n", "print(\"\\nRAG Completion result is:\")\n", "print(rag_completion)" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## SEARCH: Graph Insights\n", "\n", @@ -291,8 +291,10 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Search graph insights\n", "insights_results = await search(query_text=\"Neptune Analytics\", query_type=SearchType.GRAPH_COMPLETION)\n", @@ -302,13 +304,11 @@ " tgt_node = result[2].get(\"name\", result[2][\"type\"])\n", " relationship = result[1].get(\"relationship_name\", \"__relationship__\")\n", " print(f\"- {src_node} -[{relationship}]-> {tgt_node}\")" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## SEARCH: Entity Summaries\n", "\n", @@ -316,8 +316,10 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Query all summaries related to query.\n", "summaries = await search(query_text=\"Neptune Analytics\", query_type=SearchType.SUMMARIES)\n", @@ -326,13 +328,11 @@ " type = summary[\"type\"]\n", " text = summary[\"text\"]\n", " print(f\"- {type}: {text}\")" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## SEARCH: Chunks\n", "\n", @@ -340,8 +340,10 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "chunks = await search(query_text=\"Neptune Analytics\", query_type=SearchType.CHUNKS)\n", "print(\"\\nChunk results are:\")\n", @@ -349,9 +351,7 @@ " type = chunk[\"type\"]\n", " text = 
chunk[\"text\"]\n", " print(f\"- {type}: {text}\")" - ], - "outputs": [], - "execution_count": null + ] } ], "metadata": { From 2a6256634e2829a63e11f2c5de9f7d0ad7dac44f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 15 Oct 2025 17:35:46 +0100 Subject: [PATCH 32/61] chore: revert temporary change to dynamic_steps_example.py --- examples/python/dynamic_steps_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 5ff68cecc..bce2ea8be 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -199,7 +199,7 @@ if __name__ == "__main__": "prune_data": rebuild_kg, "prune_system": rebuild_kg, "add_text": rebuild_kg, - "cognify": False, + "cognify": rebuild_kg, "graph_metrics": rebuild_kg, "retriever": retrieve, } From 99dc35f23e26e4cd2016f50a6c783f6a0a1749e1 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 20:01:09 +0200 Subject: [PATCH 33/61] fix: resolve issue with neo4j metrics test --- .../tasks/descriptive_metrics/metrics_test_utils.py | 2 -- .../tasks/descriptive_metrics/neo4j_metrics_test.py | 11 ++++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py b/cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py index 911d9c33b..579a499fd 100644 --- a/cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +++ b/cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py @@ -1,7 +1,6 @@ from typing import List from cognee.infrastructure.engine import DataPoint from cognee.tasks.storage.add_data_points import add_data_points -from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine import cognee from cognee.infrastructure.databases.graph import get_graph_engine import json @@ -64,7 +63,6 @@ async def create_connected_test_graph(): async def get_metrics(provider: str, include_optional=True): - create_graph_engine.cache_clear() cognee.config.set_graph_database_provider(provider) graph_engine = await get_graph_engine() await graph_engine.delete_graph() diff --git a/cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py b/cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py index 2ca9e9f7e..8d7a6ab02 100644 --- a/cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +++ b/cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py @@ -1,7 +1,12 @@ -from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics import asyncio +async def main(): + from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics + + await assert_metrics(provider="neo4j", include_optional=False) + await assert_metrics(provider="neo4j", include_optional=True) + + if __name__ == "__main__": - asyncio.run(assert_metrics(provider="neo4j", include_optional=False)) - asyncio.run(assert_metrics(provider="neo4j", include_optional=True)) + asyncio.run(main()) From 2fb06e07299a53c2e5412cbe30e851b26e97b783 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 20:18:48 +0200 Subject: [PATCH 34/61] refactor: forwarding of data batch size rework --- cognee/api/v1/add/add.py | 2 ++ cognee/api/v1/cognify/cognify.py | 2 ++ cognee/modules/pipelines/operations/pipeline.py | 5 ++++- cognee/modules/pipelines/operations/run_tasks.py | 12 +++++------- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 
65394f1ec..b5a8a230f 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -41,6 +41,7 @@ async def add( extraction_rules: Optional[Dict[str, Any]] = None, tavily_config: Optional[BaseModel] = None, soup_crawler_config: Optional[BaseModel] = None, + data_batch_size: Optional[int] = 20, ): """ Add data to Cognee for knowledge graph processing. @@ -235,6 +236,7 @@ async def add( vector_db_config=vector_db_config, graph_db_config=graph_db_config, incremental_loading=incremental_loading, + data_batch_size=data_batch_size, ): pipeline_run_info = run_info diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index c3045f00a..ab5e4a023 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -51,6 +51,7 @@ async def cognify( incremental_loading: bool = True, custom_prompt: Optional[str] = None, temporal_cognify: bool = False, + data_batch_size: int = 20, ): """ Transform ingested data into a structured knowledge graph. @@ -228,6 +229,7 @@ async def cognify( graph_db_config=graph_db_config, incremental_loading=incremental_loading, pipeline_name="cognify_pipeline", + data_batch_size=data_batch_size, ) diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index b59a171f7..9d61235c1 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -35,6 +35,7 @@ async def run_pipeline( vector_db_config: dict = None, graph_db_config: dict = None, incremental_loading: bool = False, + data_batch_size: int = 20, ): validate_pipeline_tasks(tasks) await setup_and_check_environment(vector_db_config, graph_db_config) @@ -50,6 +51,7 @@ async def run_pipeline( pipeline_name=pipeline_name, context={"dataset": dataset}, incremental_loading=incremental_loading, + data_batch_size=data_batch_size, ): yield run_info @@ -62,6 +64,7 @@ async def run_pipeline_per_dataset( pipeline_name: str = "custom_pipeline", context: dict = None, incremental_loading=False, + data_batch_size: int = 20, ): # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -77,7 +80,7 @@ async def run_pipeline_per_dataset( return pipeline_run = run_tasks( - tasks, dataset.id, data, user, pipeline_name, context, incremental_loading + tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_batch_size ) async for pipeline_run_info in pipeline_run: diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index 2e0055384..18eaf8011 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -24,14 +24,11 @@ from cognee.modules.pipelines.operations import ( log_pipeline_run_complete, log_pipeline_run_error, ) -from .run_tasks_with_telemetry import run_tasks_with_telemetry from .run_tasks_data_item import run_tasks_data_item from ..tasks.task import Task logger = get_logger("run_tasks(tasks: [Task], data)") -# TODO: See if this parameter should be configurable as input for run_tasks itself -DOCUMENT_BATCH_SIZE = 10 def override_run_tasks(new_gen): @@ -62,6 +59,7 @@ async def run_tasks( pipeline_name: str = "unknown_pipeline", context: dict = None, incremental_loading: bool = False, + data_batch_size: int = 20, ): if not user: user = await get_default_user() @@ -93,12 +91,12 @@ async def run_tasks( # Create and gather batches of async tasks of 
data items that will run the pipeline for the data item results = [] - for start in range(0, len(data), DOCUMENT_BATCH_SIZE): - document_batch = data[start : start + DOCUMENT_BATCH_SIZE] + for start in range(0, len(data), data_batch_size): + data_batch = data[start : start + data_batch_size] data_item_tasks = [ asyncio.create_task( - _run_tasks_data_item( + run_tasks_data_item( data_item, dataset, tasks, @@ -110,7 +108,7 @@ async def run_tasks( incremental_loading, ) ) - for data_item in document_batch + for data_item in data_batch ] results.extend(await asyncio.gather(*data_item_tasks)) From 3a9022a26c1e26b1b70867b441096b266f884cc5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 20:22:29 +0200 Subject: [PATCH 35/61] refactor: Rename batch size for tasks to chunk batch size --- cognee/api/v1/cognify/cognify.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index d29d8c939..e0f6253d8 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -44,7 +44,7 @@ async def cognify( graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, chunk_size: int = None, - batch_size: int = None, + chunk_batch_size: int = None, config: Config = None, vector_db_config: dict = None, graph_db_config: dict = None, @@ -106,7 +106,7 @@ async def cognify( Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2) Default limits: ~512-8192 tokens depending on models. Smaller chunks = more granular but potentially fragmented knowledge. - batch_size: Number of chunks to be processed in a single batch in Cognify tasks. + chunk_batch_size: Number of chunks to be processed in a single batch in Cognify tasks. vector_db_config: Custom vector database configuration for embeddings storage. graph_db_config: Custom graph database configuration for relationship storage. run_in_background: If True, starts processing asynchronously and returns immediately. 
@@ -212,7 +212,7 @@ async def cognify( if temporal_cognify: tasks = await get_temporal_tasks( - user=user, chunker=chunker, chunk_size=chunk_size, batch_size=batch_size + user=user, chunker=chunker, chunk_size=chunk_size, chunk_batch_size=chunk_batch_size ) else: tasks = await get_default_tasks( @@ -222,7 +222,7 @@ async def cognify( chunk_size=chunk_size, config=config, custom_prompt=custom_prompt, - batch_size=batch_size, + chunk_batch_size=chunk_batch_size, ) # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for @@ -248,7 +248,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's chunk_size: int = None, config: Config = None, custom_prompt: Optional[str] = None, - batch_size: int = 100, + chunk_batch_size: int = 100, ) -> list[Task]: if config is None: ontology_config = get_ontology_env_config() @@ -267,8 +267,8 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's "ontology_config": {"ontology_resolver": get_default_ontology_resolver()} } - if batch_size is None: - batch_size = 100 + if chunk_batch_size is None: + chunk_batch_size = 100 default_tasks = [ Task(classify_documents), @@ -283,20 +283,20 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model=graph_model, config=config, custom_prompt=custom_prompt, - task_config={"batch_size": batch_size}, + task_config={"batch_size": chunk_batch_size}, ), # Generate knowledge graphs from the document chunks. Task( summarize_text, - task_config={"batch_size": batch_size}, + task_config={"batch_size": chunk_batch_size}, ), - Task(add_data_points, task_config={"batch_size": batch_size}), + Task(add_data_points, task_config={"batch_size": chunk_batch_size}), ] return default_tasks async def get_temporal_tasks( - user: User = None, chunker=TextChunker, chunk_size: int = None, batch_size: int = 10 + user: User = None, chunker=TextChunker, chunk_size: int = None, chunk_batch_size: int = 10 ) -> list[Task]: """ Builds and returns a list of temporal processing tasks to be executed in sequence. @@ -313,13 +313,13 @@ async def get_temporal_tasks( user (User, optional): The user requesting task execution, used for permission checks. chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. - batch_size (int, optional): Number of chunks to process in a single batch in Cognify + chunk_batch_size (int, optional): Number of chunks to process in a single batch in Cognify Returns: list[Task]: A list of Task objects representing the temporal processing pipeline. 
""" - if batch_size is None: - batch_size = 10 + if chunk_batch_size is None: + chunk_batch_size = 10 temporal_tasks = [ Task(classify_documents), @@ -329,9 +329,9 @@ async def get_temporal_tasks( max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), - Task(extract_events_and_timestamps, task_config={"batch_size": batch_size}), + Task(extract_events_and_timestamps, task_config={"batch_size": chunk_batch_size}), Task(extract_knowledge_graph_from_events), - Task(add_data_points, task_config={"batch_size": batch_size}), + Task(add_data_points, task_config={"batch_size": chunk_batch_size}), ] return temporal_tasks From a210bd59054dd353675589c63e57fe9d7349b766 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 20:24:36 +0200 Subject: [PATCH 36/61] refactor: rename chunk_batch_size to chunks_per_batch --- cognee/api/v1/cognify/cognify.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index e0f6253d8..1d5c36a3c 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -44,7 +44,7 @@ async def cognify( graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, chunk_size: int = None, - chunk_batch_size: int = None, + chunks_per_batch: int = None, config: Config = None, vector_db_config: dict = None, graph_db_config: dict = None, @@ -106,7 +106,7 @@ async def cognify( Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2) Default limits: ~512-8192 tokens depending on models. Smaller chunks = more granular but potentially fragmented knowledge. - chunk_batch_size: Number of chunks to be processed in a single batch in Cognify tasks. + chunks_per_batch: Number of chunks to be processed in a single batch in Cognify tasks. vector_db_config: Custom vector database configuration for embeddings storage. graph_db_config: Custom graph database configuration for relationship storage. run_in_background: If True, starts processing asynchronously and returns immediately. 
@@ -212,7 +212,7 @@ async def cognify( if temporal_cognify: tasks = await get_temporal_tasks( - user=user, chunker=chunker, chunk_size=chunk_size, chunk_batch_size=chunk_batch_size + user=user, chunker=chunker, chunk_size=chunk_size, chunks_per_batch=chunks_per_batch ) else: tasks = await get_default_tasks( @@ -222,7 +222,7 @@ async def cognify( chunk_size=chunk_size, config=config, custom_prompt=custom_prompt, - chunk_batch_size=chunk_batch_size, + chunks_per_batch=chunks_per_batch, ) # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for @@ -248,7 +248,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's chunk_size: int = None, config: Config = None, custom_prompt: Optional[str] = None, - chunk_batch_size: int = 100, + chunks_per_batch: int = 100, ) -> list[Task]: if config is None: ontology_config = get_ontology_env_config() @@ -267,8 +267,8 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's "ontology_config": {"ontology_resolver": get_default_ontology_resolver()} } - if chunk_batch_size is None: - chunk_batch_size = 100 + if chunks_per_batch is None: + chunks_per_batch = 100 default_tasks = [ Task(classify_documents), @@ -283,20 +283,20 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model=graph_model, config=config, custom_prompt=custom_prompt, - task_config={"batch_size": chunk_batch_size}, + task_config={"batch_size": chunks_per_batch}, ), # Generate knowledge graphs from the document chunks. Task( summarize_text, - task_config={"batch_size": chunk_batch_size}, + task_config={"batch_size": chunks_per_batch}, ), - Task(add_data_points, task_config={"batch_size": chunk_batch_size}), + Task(add_data_points, task_config={"batch_size": chunks_per_batch}), ] return default_tasks async def get_temporal_tasks( - user: User = None, chunker=TextChunker, chunk_size: int = None, chunk_batch_size: int = 10 + user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10 ) -> list[Task]: """ Builds and returns a list of temporal processing tasks to be executed in sequence. @@ -313,13 +313,13 @@ async def get_temporal_tasks( user (User, optional): The user requesting task execution, used for permission checks. chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. - chunk_batch_size (int, optional): Number of chunks to process in a single batch in Cognify + chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify Returns: list[Task]: A list of Task objects representing the temporal processing pipeline. 
""" - if chunk_batch_size is None: - chunk_batch_size = 10 + if chunks_per_batch is None: + chunks_per_batch = 10 temporal_tasks = [ Task(classify_documents), @@ -329,9 +329,9 @@ async def get_temporal_tasks( max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), - Task(extract_events_and_timestamps, task_config={"batch_size": chunk_batch_size}), + Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}), Task(extract_knowledge_graph_from_events), - Task(add_data_points, task_config={"batch_size": chunk_batch_size}), + Task(add_data_points, task_config={"batch_size": chunks_per_batch}), ] return temporal_tasks From 2e1bfe78b1d63b2b089235d2cc7a7742a208d3f5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 15 Oct 2025 20:26:59 +0200 Subject: [PATCH 37/61] refactor: rename variable to be more understandable --- cognee/api/v1/add/add.py | 4 ++-- cognee/api/v1/cognify/cognify.py | 4 ++-- cognee/modules/pipelines/operations/pipeline.py | 8 ++++---- cognee/modules/pipelines/operations/run_tasks.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index b5a8a230f..0f14683f9 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -41,7 +41,7 @@ async def add( extraction_rules: Optional[Dict[str, Any]] = None, tavily_config: Optional[BaseModel] = None, soup_crawler_config: Optional[BaseModel] = None, - data_batch_size: Optional[int] = 20, + data_per_batch: Optional[int] = 20, ): """ Add data to Cognee for knowledge graph processing. @@ -236,7 +236,7 @@ async def add( vector_db_config=vector_db_config, graph_db_config=graph_db_config, incremental_loading=incremental_loading, - data_batch_size=data_batch_size, + data_per_batch=data_per_batch, ): pipeline_run_info = run_info diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index ab5e4a023..1eb266765 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -51,7 +51,7 @@ async def cognify( incremental_loading: bool = True, custom_prompt: Optional[str] = None, temporal_cognify: bool = False, - data_batch_size: int = 20, + data_per_batch: int = 20, ): """ Transform ingested data into a structured knowledge graph. 
@@ -229,7 +229,7 @@ async def cognify( graph_db_config=graph_db_config, incremental_loading=incremental_loading, pipeline_name="cognify_pipeline", - data_batch_size=data_batch_size, + data_per_batch=data_per_batch, ) diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index 9d61235c1..e15e9e505 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -35,7 +35,7 @@ async def run_pipeline( vector_db_config: dict = None, graph_db_config: dict = None, incremental_loading: bool = False, - data_batch_size: int = 20, + data_per_batch: int = 20, ): validate_pipeline_tasks(tasks) await setup_and_check_environment(vector_db_config, graph_db_config) @@ -51,7 +51,7 @@ async def run_pipeline( pipeline_name=pipeline_name, context={"dataset": dataset}, incremental_loading=incremental_loading, - data_batch_size=data_batch_size, + data_per_batch=data_per_batch, ): yield run_info @@ -64,7 +64,7 @@ async def run_pipeline_per_dataset( pipeline_name: str = "custom_pipeline", context: dict = None, incremental_loading=False, - data_batch_size: int = 20, + data_per_batch: int = 20, ): # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -80,7 +80,7 @@ async def run_pipeline_per_dataset( return pipeline_run = run_tasks( - tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_batch_size + tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_per_batch ) async for pipeline_run_info in pipeline_run: diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index 18eaf8011..ecc2f647b 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -59,7 +59,7 @@ async def run_tasks( pipeline_name: str = "unknown_pipeline", context: dict = None, incremental_loading: bool = False, - data_batch_size: int = 20, + data_per_batch: int = 20, ): if not user: user = await get_default_user() @@ -91,8 +91,8 @@ async def run_tasks( # Create and gather batches of async tasks of data items that will run the pipeline for the data item results = [] - for start in range(0, len(data), data_batch_size): - data_batch = data[start : start + data_batch_size] + for start in range(0, len(data), data_per_batch): + data_batch = data[start : start + data_per_batch] data_item_tasks = [ asyncio.create_task( From 88cc7af4d7b41b764ddb9db8517ddd56d04677a8 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 16 Oct 2025 10:50:50 +0200 Subject: [PATCH 38/61] test: Add a few more examples to the workflow. 
--- .github/workflows/examples_tests.yml | 53 ++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 406420351..df007a576 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -85,8 +85,8 @@ jobs: run: uv run python ./cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py - test-multiple-examples: - name: Run Multiple Example Scripts + test-dynamic-steps-metrics: + name: Run Dynamic Steps Example runs-on: ubuntu-22.04 steps: - name: Check out repository @@ -97,7 +97,7 @@ jobs: with: python-version: '3.11.x' - - name: Run Dynamic Steps Example + - name: Run Dynamic Steps Tests env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -110,6 +110,18 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./examples/python/dynamic_steps_example.py + test-temporal-example: + name: Run Temporal Tests + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + - name: Run Temporal Example env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -123,6 +135,18 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./examples/python/temporal_example.py + test-ontology-example: + name: Run Ontology Tests + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + - name: Run Ontology Demo Example env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -136,18 +160,17 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./examples/python/ontology_demo_example.py - - name: Run Temporal Example - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - run: uv run python ./examples/python/temporal_example.py + test-agentic-reasoning: + name: Run Agentic Reasoning Tests + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' - name: Run Agentic Reasoning Example env: From 9821a01a478aab77d08470b9e5a87a7a23e6c750 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 16 Oct 2025 15:48:20 +0200 Subject: [PATCH 39/61] feat: Redis lock integration and Kuzu agentic access fix (#1504) ## Description This PR introduces a shared locked mechanism in KuzuAdapter to avoid use case when multiple subprocesses from different environments are trying to use the same Kuzu adatabase. 
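For reviewers, here is a minimal usage sketch of the lock this PR adds. The `do_kuzu_work()` helper and the `localhost:6379` / `"kuzu-lock-example"` values are illustrative placeholders, not part of the change; the constructor signature and defaults come from the new `RedisAdapter`:

```python
from cognee.infrastructure.databases.cache.redis.RedisAdapter import RedisAdapter


def do_kuzu_work():
    """Placeholder for the Kuzu query/write that must be serialized."""
    ...


# One lock per Kuzu database; KuzuAdapter derives the key as "kuzu-lock-" + uuid5(NAMESPACE_OID, db_path).
lock = RedisAdapter(host="localhost", port=6379, lock_name="kuzu-lock-example")

lock.acquire_lock()      # blocks up to blocking_timeout (default 300 s), auto-expires after timeout (default 240 s)
try:
    do_kuzu_work()
finally:
    lock.release_lock()  # release promptly so other subprocesses can proceed
```

When `SHARED_KUZU_LOCK` is enabled, `KuzuAdapter.query()` wraps each blocking query in the same acquire/release pair, so concurrent subprocesses take turns on the shared database.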
## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [x] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) None ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- .github/actions/cognee_setup/action.yml | 2 +- .github/workflows/e2e_tests.yml | 67 + .../databases/cache/__init__.py | 2 + .../databases/cache/cache_db_interface.py | 42 + .../infrastructure/databases/cache/config.py | 39 + .../databases/cache/get_cache_engine.py | 59 + .../databases/cache/redis/RedisAdapter.py | 49 + .../databases/graph/kuzu/adapter.py | 100 +- cognee/tests/subprocesses/reader.py | 25 + cognee/tests/subprocesses/simple_cognify_1.py | 31 + cognee/tests/subprocesses/simple_cognify_2.py | 31 + cognee/tests/subprocesses/writer.py | 32 + .../test_concurrent_subprocess_access.py | 84 ++ .../databases/cache/test_cache_config.py | 87 ++ docker-compose.yml | 25 + poetry.lock | 1316 +++++++++++++---- pyproject.toml | 1 + uv.lock | 19 +- .../run_subprocess_test.py | 31 + 19 files changed, 1671 insertions(+), 371 deletions(-) create mode 100644 cognee/infrastructure/databases/cache/__init__.py create mode 100644 cognee/infrastructure/databases/cache/cache_db_interface.py create mode 100644 cognee/infrastructure/databases/cache/config.py create mode 100644 cognee/infrastructure/databases/cache/get_cache_engine.py create mode 100644 cognee/infrastructure/databases/cache/redis/RedisAdapter.py create mode 100644 cognee/tests/subprocesses/reader.py create mode 100644 cognee/tests/subprocesses/simple_cognify_1.py create mode 100644 cognee/tests/subprocesses/simple_cognify_2.py create mode 100644 cognee/tests/subprocesses/writer.py create mode 100644 cognee/tests/test_concurrent_subprocess_access.py create mode 100644 cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py create mode 100644 working_dir_error_replication/run_subprocess_test.py diff --git a/.github/actions/cognee_setup/action.yml b/.github/actions/cognee_setup/action.yml index 1326f2d81..4017d524b 100644 --- a/.github/actions/cognee_setup/action.yml +++ b/.github/actions/cognee_setup/action.yml @@ -41,4 +41,4 @@ runs: EXTRA_ARGS="$EXTRA_ARGS --extra $extra" done fi - uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j $EXTRA_ARGS + uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 3fe7a7992..9582a3f3b 100644 --- a/.github/workflows/e2e_tests.yml 
+++ b/.github/workflows/e2e_tests.yml @@ -1,4 +1,6 @@ name: Reusable Integration Tests +permissions: + contents: read on: workflow_call: @@ -264,3 +266,68 @@ jobs: EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_edge_ingestion.py + + + + run_concurrent_subprocess_access_test: + name: Concurrent Subprocess access test + runs-on: ubuntu-latest + defaults: + run: + shell: bash + services: + postgres: + image: pgvector/pgvector:pg17 + env: + POSTGRES_USER: cognee + POSTGRES_PASSWORD: cognee + POSTGRES_DB: cognee_db + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + redis: + image: redis:7 + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 5s + --health-timeout 3s + --health-retries 5 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + extra-dependencies: "postgres redis" + + - name: Run Concurrent subprocess access test (Kuzu/Lancedb/Postgres) + env: + ENV: dev + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + GRAPH_DATABASE_PROVIDER: 'kuzu' + CACHING: true + SHARED_KUZU_LOCK: true + DB_PROVIDER: 'postgres' + DB_NAME: 'cognee_db' + DB_HOST: '127.0.0.1' + DB_PORT: 5432 + DB_USERNAME: cognee + DB_PASSWORD: cognee + run: uv run python ./cognee/tests/test_concurrent_subprocess_access.py \ No newline at end of file diff --git a/cognee/infrastructure/databases/cache/__init__.py b/cognee/infrastructure/databases/cache/__init__.py new file mode 100644 index 000000000..d96c77658 --- /dev/null +++ b/cognee/infrastructure/databases/cache/__init__.py @@ -0,0 +1,2 @@ +from .get_cache_engine import get_cache_engine +from .config import get_cache_config diff --git a/cognee/infrastructure/databases/cache/cache_db_interface.py b/cognee/infrastructure/databases/cache/cache_db_interface.py new file mode 100644 index 000000000..0c0b578f8 --- /dev/null +++ b/cognee/infrastructure/databases/cache/cache_db_interface.py @@ -0,0 +1,42 @@ +from abc import ABC, abstractmethod +from contextlib import contextmanager + + +class CacheDBInterface(ABC): + """ + Abstract base class for distributed cache coordination systems (e.g., Redis, Memcached). + Provides a common interface for lock acquisition, release, and context-managed locking. + """ + + def __init__(self, host: str, port: int, lock_key: str): + self.host = host + self.port = port + self.lock_key = lock_key + self.lock = None + + @abstractmethod + def acquire_lock(self): + """ + Acquire a lock on the given key. + Must be implemented by subclasses. + """ + pass + + @abstractmethod + def release_lock(self): + """ + Release the lock if it is held. + Must be implemented by subclasses. + """ + pass + + @contextmanager + def hold_lock(self): + """ + Context manager for safely acquiring and releasing the lock. 
+        """
+        self.acquire_lock()
+        try:
+            yield
+        finally:
+            self.release_lock()
diff --git a/cognee/infrastructure/databases/cache/config.py b/cognee/infrastructure/databases/cache/config.py
new file mode 100644
index 000000000..b399e0259
--- /dev/null
+++ b/cognee/infrastructure/databases/cache/config.py
@@ -0,0 +1,39 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+from functools import lru_cache
+
+
+class CacheConfig(BaseSettings):
+    """
+    Configuration for distributed cache systems (e.g., Redis), used for locking or coordination.
+
+    Attributes:
+    - shared_kuzu_lock: Shared kuzu lock logic on/off.
+    - cache_host: Hostname of the cache service.
+    - cache_port: Port number for the cache service.
+    - agentic_lock_expire: Automatic lock expiration time (in seconds).
+    - agentic_lock_timeout: Maximum time (in seconds) to wait for the lock release.
+    """
+
+    caching: bool = False
+    shared_kuzu_lock: bool = False
+    cache_host: str = "localhost"
+    cache_port: int = 6379
+    agentic_lock_expire: int = 240
+    agentic_lock_timeout: int = 300
+
+    model_config = SettingsConfigDict(env_file=".env", extra="allow")
+
+    def to_dict(self) -> dict:
+        return {
+            "caching": self.caching,
+            "shared_kuzu_lock": self.shared_kuzu_lock,
+            "cache_host": self.cache_host,
+            "cache_port": self.cache_port,
+            "agentic_lock_expire": self.agentic_lock_expire,
+            "agentic_lock_timeout": self.agentic_lock_timeout,
+        }
+
+
+@lru_cache
+def get_cache_config():
+    return CacheConfig()
diff --git a/cognee/infrastructure/databases/cache/get_cache_engine.py b/cognee/infrastructure/databases/cache/get_cache_engine.py
new file mode 100644
index 000000000..92186f877
--- /dev/null
+++ b/cognee/infrastructure/databases/cache/get_cache_engine.py
@@ -0,0 +1,59 @@
+"""Factory to get the appropriate cache coordination engine (e.g., Redis)."""
+
+from functools import lru_cache
+from cognee.infrastructure.databases.cache.config import get_cache_config
+
+from cognee.infrastructure.databases.cache.cache_db_interface import CacheDBInterface
+
+config = get_cache_config()
+
+
+@lru_cache
+def create_cache_engine(
+    cache_host: str,
+    cache_port: int,
+    lock_key: str,
+    agentic_lock_expire: int = 240,
+    agentic_lock_timeout: int = 300,
+):
+    """
+    Factory function to instantiate a cache coordination backend (currently Redis).
+
+    Parameters:
+    -----------
+    - cache_host: Hostname or IP of the cache server.
+    - cache_port: Port number to connect to.
+    - lock_key: Identifier used for the locking resource.
+    - agentic_lock_expire: Duration to hold the lock after acquisition.
+    - agentic_lock_timeout: Max time to wait for the lock before failing.
+
+    Returns:
+    --------
+    - CacheDBInterface: An instance of the appropriate cache adapter. TODO: only Redis is supported for now; split this factory once more backends are added.
+    """
+    if config.caching:
+        from cognee.infrastructure.databases.cache.redis.RedisAdapter import RedisAdapter
+
+        return RedisAdapter(
+            host=cache_host,
+            port=cache_port,
+            lock_name=lock_key,
+            timeout=agentic_lock_expire,
+            blocking_timeout=agentic_lock_timeout,
+        )
+    else:
+        return None
+
+
+def get_cache_engine(lock_key: str) -> CacheDBInterface:
+    """
+    Returns a cache adapter instance using current context configuration. 
+    """
+
+    return create_cache_engine(
+        cache_host=config.cache_host,
+        cache_port=config.cache_port,
+        lock_key=lock_key,
+        agentic_lock_expire=config.agentic_lock_expire,
+        agentic_lock_timeout=config.agentic_lock_timeout,
+    )
diff --git a/cognee/infrastructure/databases/cache/redis/RedisAdapter.py b/cognee/infrastructure/databases/cache/redis/RedisAdapter.py
new file mode 100644
index 000000000..70c8de9bb
--- /dev/null
+++ b/cognee/infrastructure/databases/cache/redis/RedisAdapter.py
@@ -0,0 +1,49 @@
+import redis
+from contextlib import contextmanager
+from cognee.infrastructure.databases.cache.cache_db_interface import CacheDBInterface
+
+
+class RedisAdapter(CacheDBInterface):
+    def __init__(self, host, port, lock_name, timeout=240, blocking_timeout=300):
+        super().__init__(host, port, lock_name)
+        self.redis = redis.Redis(host=host, port=port)
+        self.timeout = timeout
+        self.blocking_timeout = blocking_timeout
+
+    def acquire_lock(self):
+        """
+        Acquire the Redis lock manually. Raises if acquisition fails.
+        """
+        self.lock = self.redis.lock(
+            name=self.lock_key,
+            timeout=self.timeout,
+            blocking_timeout=self.blocking_timeout,
+        )
+
+        acquired = self.lock.acquire()
+        if not acquired:
+            raise RuntimeError(f"Could not acquire Redis lock: {self.lock_key}")
+
+        return self.lock
+
+    def release_lock(self):
+        """
+        Release the Redis lock manually, if held.
+        """
+        if self.lock:
+            try:
+                self.lock.release()
+                self.lock = None
+            except redis.exceptions.LockError:
+                pass
+
+    @contextmanager
+    def hold_lock(self):
+        """
+        Context manager for acquiring and releasing the Redis lock automatically.
+        """
+        self.acquire_lock()
+        try:
+            yield
+        finally:
+            self.release_lock()
diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py
index 7b772097f..3f0fb0c57 100644
--- a/cognee/infrastructure/databases/graph/kuzu/adapter.py
+++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py
@@ -4,7 +4,7 @@
 import os
 import json
 import asyncio
 import tempfile
-from uuid import UUID
+from uuid import UUID, uuid5, NAMESPACE_OID
 from kuzu import Connection
 from kuzu.database import Database
 from datetime import datetime, timezone
@@ -23,9 +23,14 @@
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.storage.utils import JSONEncoder
 from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int
 from cognee.tasks.temporal_graph.models import Timestamp
+from cognee.infrastructure.databases.cache.config import get_cache_config
 
 logger = get_logger()
 
+cache_config = get_cache_config()
+if cache_config.shared_kuzu_lock:
+    from cognee.infrastructure.databases.cache.get_cache_engine import get_cache_engine
+
 
 class KuzuAdapter(GraphDBInterface):
     """
@@ -39,12 +44,20 @@
     def __init__(self, db_path: str):
         """Initialize Kuzu database connection and schema."""
+        self.open_connections = 0
+        self._is_closed = False
         self.db_path = db_path  # Path for the database directory
         self.db: Optional[Database] = None
         self.connection: Optional[Connection] = None
-        self.executor = ThreadPoolExecutor()
-        self._initialize_connection()
+        if cache_config.shared_kuzu_lock:
+            self.redis_lock = get_cache_engine(
+                lock_key="kuzu-lock-" + str(uuid5(NAMESPACE_OID, db_path))
+            )
+        else:
+            self.executor = ThreadPoolExecutor()
+            self._initialize_connection()
         self.KUZU_ASYNC_LOCK = asyncio.Lock()
+        self._connection_change_lock = asyncio.Lock()
 
     def _initialize_connection(self) -> None:
         """Initialize the Kuzu database 
connection and schema.""" @@ -209,9 +222,13 @@ class KuzuAdapter(GraphDBInterface): params = params or {} def blocking_query(): + lock_acquired = False try: + if cache_config.shared_kuzu_lock: + self.redis_lock.acquire_lock() + lock_acquired = True if not self.connection: - logger.debug("Reconnecting to Kuzu database...") + logger.info("Reconnecting to Kuzu database...") self._initialize_connection() result = self.connection.execute(query, params) @@ -225,12 +242,47 @@ class KuzuAdapter(GraphDBInterface): val = val.as_py() processed_rows.append(val) rows.append(tuple(processed_rows)) + return rows except Exception as e: logger.error(f"Query execution failed: {str(e)}") raise + finally: + if cache_config.shared_kuzu_lock and lock_acquired: + try: + self.close() + finally: + self.redis_lock.release_lock() - return await loop.run_in_executor(self.executor, blocking_query) + if cache_config.shared_kuzu_lock: + async with self._connection_change_lock: + self.open_connections += 1 + logger.info(f"Open connections after open: {self.open_connections}") + try: + result = blocking_query() + finally: + self.open_connections -= 1 + logger.info(f"Open connections after close: {self.open_connections}") + return result + else: + result = await loop.run_in_executor(self.executor, blocking_query) + return result + + def close(self): + if self.connection: + del self.connection + self.connection = None + if self.db: + del self.db + self.db = None + self._is_closed = True + logger.info("Kuzu database closed successfully") + + def reopen(self): + if self._is_closed: + self._is_closed = False + self._initialize_connection() + logger.info("Kuzu database re-opened successfully") @asynccontextmanager async def get_session(self): @@ -1557,44 +1609,6 @@ class KuzuAdapter(GraphDBInterface): logger.error(f"Failed to delete graph data: {e}") raise - async def clear_database(self) -> None: - """ - Clear all data from the database by deleting the database files and reinitializing. - - This method removes all files associated with the database and reinitializes the Kuzu - database structure, ensuring a completely empty state. It handles exceptions that might - occur during file deletions or initializations carefully. - """ - try: - if self.connection: - self.connection = None - if self.db: - self.db.close() - self.db = None - - db_dir = os.path.dirname(self.db_path) - db_name = os.path.basename(self.db_path) - file_storage = get_file_storage(db_dir) - - if await file_storage.file_exists(db_name): - await file_storage.remove_all() - logger.info(f"Deleted Kuzu database files at {self.db_path}") - - # Reinitialize the database - self._initialize_connection() - # Verify the database is empty - result = self.connection.execute("MATCH (n:Node) RETURN COUNT(n)") - count = result.get_next()[0] if result.has_next() else 0 - if count > 0: - logger.warning( - f"Database still contains {count} nodes after clearing, forcing deletion" - ) - self.connection.execute("MATCH (n:Node) DETACH DELETE n") - logger.info("Database cleared successfully") - except Exception as e: - logger.error(f"Error during database clearing: {e}") - raise - async def get_document_subgraph(self, data_id: str): """ Get all nodes that should be deleted when removing a document. 
diff --git a/cognee/tests/subprocesses/reader.py b/cognee/tests/subprocesses/reader.py new file mode 100644 index 000000000..df54a63e4 --- /dev/null +++ b/cognee/tests/subprocesses/reader.py @@ -0,0 +1,25 @@ +import asyncio +import time +from cognee.infrastructure.databases.graph.kuzu.adapter import KuzuAdapter + +# This will create the test.db if it doesn't exist + + +async def main(): + adapter = KuzuAdapter("test.db") + result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)") + print(f"Reader: Found {result[0][0]} nodes") + result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)") + print(f"Reader: Found {result[0][0]} nodes") + result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)") + print(f"Reader: Found {result[0][0]} nodes") + result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)") + print(f"Reader: Found {result[0][0]} nodes") + result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)") + print(f"Reader: Found {result} nodes") + result = await adapter.query("MATCH (n:Node) RETURN COUNT(n)") + print(f"Reader: Found {result[0][0]} nodes") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/subprocesses/simple_cognify_1.py b/cognee/tests/subprocesses/simple_cognify_1.py new file mode 100644 index 000000000..cf4d65c88 --- /dev/null +++ b/cognee/tests/subprocesses/simple_cognify_1.py @@ -0,0 +1,31 @@ +import asyncio +import cognee +from cognee.shared.logging_utils import setup_logging, INFO +from cognee.api.v1.search import SearchType + + +async def main(): + await cognee.cognify(datasets=["first_cognify_dataset"]) + + query_text = ( + "Tell me what is in the context. Additionally write out 'FIRST_COGNIFY' before your answer" + ) + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=query_text, + datasets=["first_cognify_dataset"], + ) + + print("Search results:") + for result_text in search_results: + print(result_text) + + +if __name__ == "__main__": + logger = setup_logging(log_level=INFO) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(main()) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) diff --git a/cognee/tests/subprocesses/simple_cognify_2.py b/cognee/tests/subprocesses/simple_cognify_2.py new file mode 100644 index 000000000..6de5035ec --- /dev/null +++ b/cognee/tests/subprocesses/simple_cognify_2.py @@ -0,0 +1,31 @@ +import asyncio +import cognee +from cognee.shared.logging_utils import setup_logging, INFO +from cognee.api.v1.search import SearchType + + +async def main(): + await cognee.cognify(datasets=["second_cognify_dataset"]) + + query_text = ( + "Tell me what is in the context. 
Additionally write out 'SECOND_COGNIFY' before your answer" + ) + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=query_text, + datasets=["second_cognify_dataset"], + ) + + print("Search results:") + for result_text in search_results: + print(result_text) + + +if __name__ == "__main__": + logger = setup_logging(log_level=INFO) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(main()) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) diff --git a/cognee/tests/subprocesses/writer.py b/cognee/tests/subprocesses/writer.py new file mode 100644 index 000000000..27d00caba --- /dev/null +++ b/cognee/tests/subprocesses/writer.py @@ -0,0 +1,32 @@ +import asyncio +import time +import uuid +from cognee.modules.data.processing.document_types import PdfDocument +from cognee.infrastructure.databases.graph.kuzu.adapter import KuzuAdapter + + +def create_node(name): + document = PdfDocument( + id=uuid.uuid4(), + name=name, + raw_data_location=name, + external_metadata="test_external_metadata", + mime_type="test_mime", + ) + return document + + +async def main(): + adapter = KuzuAdapter("test.db") + nodes = [create_node(f"Node{i}") for i in range(5)] + + print("Writer: Starting...") + await adapter.add_nodes(nodes) + + print("writer finished...") + + time.sleep(10) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/test_concurrent_subprocess_access.py b/cognee/tests/test_concurrent_subprocess_access.py new file mode 100644 index 000000000..de03ed254 --- /dev/null +++ b/cognee/tests/test_concurrent_subprocess_access.py @@ -0,0 +1,84 @@ +import os +import asyncio +import cognee +import pathlib +import subprocess + +from cognee.shared.logging_utils import get_logger + +logger = get_logger() + +""" +Test: Redis-based Kùzu Locking Across Subprocesses + +This test ensures the Redis shared lock correctly serializes access to the Kùzu +database when multiple subprocesses (writer/reader and cognify tasks) run in parallel. +If this test fails, it indicates the locking mechanism is not properly handling +concurrent subprocess access. 
+""" + + +async def concurrent_subprocess_access(): + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/concurrent_tasks") + ).resolve() + ) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/concurrent_tasks") + ).resolve() + ) + + subprocess_directory_path = str( + pathlib.Path(os.path.join(pathlib.Path(__file__).parent, "subprocesses/")).resolve() + ) + + writer_path = subprocess_directory_path + "/writer.py" + reader_path = subprocess_directory_path + "/reader.py" + + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + writer_process = subprocess.Popen([os.sys.executable, str(writer_path)]) + + reader_process = subprocess.Popen([os.sys.executable, str(reader_path)]) + + # Wait for both processes to complete + writer_process.wait() + reader_process.wait() + + logger.info("Basic write read subprocess example finished") + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + text = """ + This is the text of the first cognify subprocess + """ + await cognee.add(text, dataset_name="first_cognify_dataset") + + text = """ + This is the text of the second cognify subprocess + """ + await cognee.add(text, dataset_name="second_cognify_dataset") + + first_cognify_path = subprocess_directory_path + "/simple_cognify_1.py" + second_cognify_path = subprocess_directory_path + "/simple_cognify_2.py" + + first_cognify_process = subprocess.Popen([os.sys.executable, str(first_cognify_path)]) + + second_cognify_process = subprocess.Popen([os.sys.executable, str(second_cognify_path)]) + + # Wait for both processes to complete + first_cognify_process.wait() + second_cognify_process.wait() + + logger.info("Database concurrent subprocess example finished") + + +if __name__ == "__main__": + asyncio.run(concurrent_subprocess_access()) diff --git a/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py b/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py new file mode 100644 index 000000000..cb5086a70 --- /dev/null +++ b/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py @@ -0,0 +1,87 @@ +"""Tests for cache configuration.""" + +import pytest +from cognee.infrastructure.databases.cache.config import CacheConfig, get_cache_config + + +def test_cache_config_defaults(): + """Test that CacheConfig has the correct default values.""" + config = CacheConfig() + + assert config.caching is False + assert config.shared_kuzu_lock is False + assert config.cache_host == "localhost" + assert config.cache_port == 6379 + assert config.agentic_lock_expire == 240 + assert config.agentic_lock_timeout == 300 + + +def test_cache_config_custom_values(): + """Test that CacheConfig accepts custom values.""" + config = CacheConfig( + caching=True, + shared_kuzu_lock=True, + cache_host="redis.example.com", + cache_port=6380, + agentic_lock_expire=120, + agentic_lock_timeout=180, + ) + + assert config.caching is True + assert config.shared_kuzu_lock is True + assert config.cache_host == "redis.example.com" + assert config.cache_port == 6380 + assert config.agentic_lock_expire == 120 + assert config.agentic_lock_timeout == 180 + + +def test_cache_config_to_dict(): + """Test the to_dict method returns all configuration values.""" + config = CacheConfig( + caching=True, + shared_kuzu_lock=True, + 
cache_host="test-host", + cache_port=7000, + agentic_lock_expire=100, + agentic_lock_timeout=200, + ) + + config_dict = config.to_dict() + + assert config_dict == { + "caching": True, + "shared_kuzu_lock": True, + "cache_host": "test-host", + "cache_port": 7000, + "agentic_lock_expire": 100, + "agentic_lock_timeout": 200, + } + + +def test_get_cache_config_singleton(): + """Test that get_cache_config returns the same instance.""" + config1 = get_cache_config() + config2 = get_cache_config() + + assert config1 is config2 + + +def test_cache_config_extra_fields_allowed(): + """Test that CacheConfig allows extra fields due to extra='allow'.""" + config = CacheConfig(extra_field="extra_value", another_field=123) + + assert hasattr(config, "extra_field") + assert config.extra_field == "extra_value" + assert hasattr(config, "another_field") + assert config.another_field == 123 + + +def test_cache_config_boolean_type_validation(): + """Test that boolean fields accept various truthy/falsy values.""" + config1 = CacheConfig(caching="true", shared_kuzu_lock="yes") + assert config1.caching is True + assert config1.shared_kuzu_lock is True + + config2 = CacheConfig(caching="false", shared_kuzu_lock="no") + assert config2.caching is False + assert config2.shared_kuzu_lock is False diff --git a/docker-compose.yml b/docker-compose.yml index 9f0e199a0..43d9b2607 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -129,6 +129,30 @@ services: networks: - cognee-network + redis: + image: redis:7-alpine + container_name: redis + profiles: + - redis + ports: + - "6379:6379" + networks: + - cognee-network + volumes: + - redis_data:/data + command: [ "redis-server", "--appendonly", "yes" ] + + + redisinsight: + image: redislabs/redisinsight:latest + container_name: redisinsight + restart: always + ports: + - "5540:5540" + networks: + - cognee-network + + networks: cognee-network: name: cognee-network @@ -136,3 +160,4 @@ networks: volumes: chromadb_data: postgres_data: + redis_data: diff --git a/poetry.lock b/poetry.lock index 120de4f7f..62ae7be8d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
[[package]] name = "accelerate" @@ -7,7 +7,7 @@ description = "Accelerate" optional = true python-versions = ">=3.9.0" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "accelerate-1.10.1-py3-none-any.whl", hash = "sha256:3621cff60b9a27ce798857ece05e2b9f56fcc71631cfb31ccf71f0359c311f11"}, {file = "accelerate-1.10.1.tar.gz", hash = "sha256:3dea89e433420e4bfac0369cae7e36dcd6a56adfcfd38cdda145c6225eab5df8"}, @@ -315,7 +315,7 @@ description = "ANTLR 4.9.3 runtime for Python 3.7" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docling\" or extra == \"docs\"" files = [ {file = "antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b"}, ] @@ -354,6 +354,35 @@ files = [ {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, ] +[[package]] +name = "apscheduler" +version = "3.11.0" +description = "In-process task scheduler with Cron-like capabilities" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"scraping\"" +files = [ + {file = "APScheduler-3.11.0-py3-none-any.whl", hash = "sha256:fc134ca32e50f5eadcc4938e3a4545ab19131435e851abb40b34d63d5141c6da"}, + {file = "apscheduler-3.11.0.tar.gz", hash = "sha256:4c622d250b0955a65d5d0eb91c33e6d43fd879834bf541e0a18661ae60460133"}, +] + +[package.dependencies] +tzlocal = ">=3.0" + +[package.extras] +doc = ["packaging", "sphinx", "sphinx-rtd-theme (>=1.3.0)"] +etcd = ["etcd3", "protobuf (<=3.21.0)"] +gevent = ["gevent"] +mongodb = ["pymongo (>=3.0)"] +redis = ["redis (>=3.0)"] +rethinkdb = ["rethinkdb (>=2.4.0)"] +sqlalchemy = ["sqlalchemy (>=1.4)"] +test = ["APScheduler[etcd,mongodb,redis,rethinkdb,sqlalchemy,tornado,zookeeper]", "PySide6 ; platform_python_implementation == \"CPython\" and python_version < \"3.14\"", "anyio (>=4.5.2)", "gevent ; python_version < \"3.14\"", "pytest", "pytz", "twisted ; python_version < \"3.14\""] +tornado = ["tornado (>=4.3)"] +twisted = ["twisted"] +zookeeper = ["kazoo"] + [[package]] name = "argon2-cffi" version = "23.1.0" @@ -510,7 +539,7 @@ description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"falkordb\" and python_full_version < \"3.11.3\" or python_version == \"3.10\"" +markers = "extra == \"redis\" and python_full_version < \"3.11.3\" or python_version == \"3.10\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -768,7 +797,7 @@ description = "Screen-scraping library" optional = true python-versions = ">=3.7.0" groups = ["main"] -markers = "extra == \"notebook\" or extra == \"dev\" or extra == \"docs\" or extra == \"evals\"" +markers = "extra == \"scraping\" or extra == \"notebook\" or extra == \"dev\" or extra == \"docs\" or extra == \"evals\" or extra == \"docling\"" files = [ {file = "beautifulsoup4-4.13.5-py3-none-any.whl", hash = "sha256:642085eaa22233aceadff9c69651bc51e8bf3f874fb6d7104ece2beb24b47c4a"}, {file = "beautifulsoup4-4.13.5.tar.gz", hash = "sha256:5e70131382930e7c3de33450a2f54a63d5e4b19386eab43a5b34d594268f3695"}, @@ -878,7 +907,7 @@ description = "Extensible memoizing collections and decorators" optional = true 
python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" +markers = "extra == \"deepeval\" or extra == \"chromadb\" or extra == \"docs\"" files = [ {file = "cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a"}, {file = "cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4"}, @@ -1203,7 +1232,7 @@ description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["main"] -markers = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or os_name == \"nt\" or sys_platform == \"win32\")" +markers = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -1227,6 +1256,25 @@ humanfriendly = ">=9.1" [package.extras] cron = ["capturer (>=2.4)"] +[[package]] +name = "colorlog" +version = "6.9.0" +description = "Add colours to the output of Python's logging module." +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "colorlog-6.9.0-py3-none-any.whl", hash = "sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff"}, + {file = "colorlog-6.9.0.tar.gz", hash = "sha256:bfba54a1b93b94f54e1f4fe48395725a3d92fd2a4af702f6bd70946bdc0c6ac2"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} + +[package.extras] +development = ["black", "flake8", "mypy", "pytest", "types-colorama"] + [[package]] name = "comm" version = "0.2.3" @@ -1862,7 +1910,7 @@ description = "serialize all of Python" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"dev\"" +markers = "extra == \"dev\" or extra == \"docling\"" files = [ {file = "dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049"}, {file = "dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0"}, @@ -2022,6 +2070,167 @@ idna = ["idna (>=3.10)"] trio = ["trio (>=0.30)"] wmi = ["wmi (>=1.5.1) ; platform_system == \"Windows\""] +[[package]] +name = "docling" +version = "2.56.1" +description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." 
+optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "docling-2.56.1-py3-none-any.whl", hash = "sha256:9c84eb4f5a78cd8a3c88f833157154b4e86c8c853b433e5a8ae811f69081938e"}, + {file = "docling-2.56.1.tar.gz", hash = "sha256:94cdededd8617b3b164ac42ad1c8d73271507b21aecbbaeb16a05b009b0550f5"}, +] + +[package.dependencies] +accelerate = ">=1.0.0,<2" +beautifulsoup4 = ">=4.12.3,<5.0.0" +certifi = ">=2024.7.4" +docling-core = {version = ">=2.48.2,<3.0.0", extras = ["chunking"]} +docling-ibm-models = ">=3.9.1,<4" +docling-parse = ">=4.4.0,<5.0.0" +filetype = ">=1.2.0,<2.0.0" +huggingface_hub = ">=0.23,<1" +lxml = ">=4.0.0,<6.0.0" +marko = ">=2.1.2,<3.0.0" +ocrmac = {version = ">=1.0.0,<2.0.0", markers = "sys_platform == \"darwin\""} +openpyxl = ">=3.1.5,<4.0.0" +pandas = ">=2.1.4,<3.0.0" +pillow = ">=10.0.0,<12.0.0" +pluggy = ">=1.0.0,<2.0.0" +polyfactory = ">=2.22.2" +pydantic = ">=2.0.0,<3.0.0" +pydantic-settings = ">=2.3.0,<3.0.0" +pylatexenc = ">=2.10,<3.0" +pypdfium2 = ">=4.30.0,<4.30.1 || >4.30.1,<5.0.0" +python-docx = ">=1.1.2,<2.0.0" +python-pptx = ">=1.0.2,<2.0.0" +rapidocr = {version = ">=3.3,<4.0.0", markers = "python_version < \"3.14\""} +requests = ">=2.32.2,<3.0.0" +rtree = ">=1.3.0,<2.0.0" +scipy = ">=1.6.0,<2.0.0" +tqdm = ">=4.65.0,<5.0.0" +typer = ">=0.12.5,<0.20.0" + +[package.extras] +asr = ["openai-whisper (>=20250625)"] +easyocr = ["easyocr (>=1.7,<2.0)"] +ocrmac = ["ocrmac (>=1.0.0,<2.0.0) ; sys_platform == \"darwin\""] +rapidocr = ["onnxruntime (>=1.7.0,<2.0.0)", "rapidocr (>=3.3,<4.0.0) ; python_version < \"3.14\""] +tesserocr = ["tesserocr (>=2.7.1,<3.0.0)"] +vlm = ["accelerate (>=1.2.1,<2.0.0)", "mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= \"3.10\" and sys_platform == \"darwin\" and platform_machine == \"arm64\"", "qwen-vl-utils (>=0.0.11)", "transformers (>=4.46.0,<5.0.0)", "vllm (>=0.10.0,<1.0.0) ; python_version >= \"3.10\" and sys_platform == \"linux\" and platform_machine == \"x86_64\""] + +[[package]] +name = "docling-core" +version = "2.48.4" +description = "A python library to define and validate data types in Docling." 
+optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "docling_core-2.48.4-py3-none-any.whl", hash = "sha256:367675c1165d0934ae498fa57ca2d27ef0468aad74dc44a5ab061f5d87882ea1"}, + {file = "docling_core-2.48.4.tar.gz", hash = "sha256:d87ce3021cdae3d073ce7572a2396b69be3cde82ebf9a74d4bad1e1cdfdfd524"}, +] + +[package.dependencies] +jsonref = ">=1.1.0,<2.0.0" +jsonschema = ">=4.16.0,<5.0.0" +latex2mathml = ">=3.77.0,<4.0.0" +pandas = ">=2.1.4,<3.0.0" +pillow = ">=10.0.0,<12.0.0" +pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0" +pyyaml = ">=5.1,<7.0.0" +semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""} +tabulate = ">=0.9.0,<0.10.0" +transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""} +typer = ">=0.12.5,<0.20.0" +typing-extensions = ">=4.12.2,<5.0.0" + +[package.extras] +chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] +chunking-openai = ["semchunk", "tiktoken (>=0.9.0,<0.10.0)"] + +[[package]] +name = "docling-ibm-models" +version = "3.9.1" +description = "This package contains the AI models used by the Docling PDF conversion package" +optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "docling_ibm_models-3.9.1-py3-none-any.whl", hash = "sha256:f2d845703877a3ca8853b57775eb8e88a7a9503d4fa110500a2550b8d63d0098"}, + {file = "docling_ibm_models-3.9.1.tar.gz", hash = "sha256:ac6cd1c2be93437cbb5c1f1a1a4030792a38859a1655b14f25cbc8aec760c351"}, +] + +[package.dependencies] +accelerate = ">=1.2.1,<2.0.0" +docling-core = ">=2.19.0,<3.0.0" +huggingface_hub = ">=0.23,<1" +jsonlines = ">=3.1.0,<4.0.0" +numpy = ">=1.24.4,<3.0.0" +opencv-python-headless = ">=4.6.0.66,<5.0.0.0" +Pillow = ">=10.0.0,<12.0.0" +pydantic = ">=2.0.0,<3.0.0" +rtree = ">=1.0.0" +safetensors = {version = ">=0.4.3,<1", extras = ["torch"]} +torch = ">=2.2.2,<3.0.0" +torchvision = ">=0,<1" +tqdm = ">=4.64.0,<5.0.0" +transformers = ">=4.42.0,<5.0.0" + +[[package]] +name = "docling-parse" +version = "4.5.0" +description = "Simple package to extract text with coordinates from programmatic PDFs" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "docling_parse-4.5.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:52df1c5bbafe5199c090bf47eb802c2fe40173fb438200f9a7cbe401aa1eed74"}, + {file = "docling_parse-4.5.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:99e353ab01ac5c81318b67f42c4fc83ac4a0b5b4783bc566f19656204acf45f0"}, + {file = "docling_parse-4.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9223485df491432f5549dd4566c6649ff32f54370701a004673e27e6fa94a9e"}, + {file = "docling_parse-4.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41ae6a7f0139d48b9ce8e0a7c43be003e6fa9382919a7efa76153bd1cdbb5e21"}, + {file = "docling_parse-4.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:8beb4f2c79c676b93ab3a14f86586adb51c3d5a2e3c1a902186e4cd6ed0a2e45"}, + {file = "docling_parse-4.5.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:f830409eb96b063ae9f3f4e676f760b0d9738bcb0708ba6b840b7e0c84c490bd"}, + {file = "docling_parse-4.5.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0a1a5f3e2f11ea74ab28d9c04b9391fa4b929c4af045c16bfb0da1e377646e54"}, + {file = 
"docling_parse-4.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee02646e7a158c9f67d8df0052b544f1240d3c28eefa4658603931c13eac4435"}, + {file = "docling_parse-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c49193988b56133149584fed70b176de85c95fe698849b2acf68fde9df3a93e5"}, + {file = "docling_parse-4.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:256019969f1edc08b051a90fe739430593aaf7cd59fb18a2e00745f18533ce43"}, + {file = "docling_parse-4.5.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:d0ea05741721a76cfca6559d7cac283f2b2953915745b439be0ca8557864bb33"}, + {file = "docling_parse-4.5.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:a5f0bcdd6c84acc3f3a4c1f0fb96be7e9cff7a0bdff85f2f13caa80d2a9fac8f"}, + {file = "docling_parse-4.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c8906d076219a18f4f86b1fec4e4cc3699460e78c88a5731ead48dfbb71835a"}, + {file = "docling_parse-4.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84186662e4780375de28b1bcb18112b04bd8e6aedb787d96544cc0d687f9629"}, + {file = "docling_parse-4.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:5688fe4281dac16e807496c0b19587e25c53a9542d12f36b3a8fb2e66de78eb2"}, + {file = "docling_parse-4.5.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:d8b2a25262a09e956516c4439ae143a66a55212f0ef9945928159caf1346408f"}, + {file = "docling_parse-4.5.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:368ebdb22ec03aa29b25d2684e51c74f6e167ab6809cd7bb5bb5b97cfe21bf8c"}, + {file = "docling_parse-4.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7c9e8954118331438eb8da6058da0e3caf12735b47a86af9521e44465bbb2d4"}, + {file = "docling_parse-4.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24360a0985a8f76ff99c39e533d208bb57427caf96b9ceb585090cd10558f87a"}, + {file = "docling_parse-4.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:c3dba06a3cb8797587c90f5aa10cc2c51803d8f5cd67342ea948288a30503868"}, + {file = "docling_parse-4.5.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:217fe2466ca2723bdecbdb162ca73891c1746ec15b8d99ec203f8df3305091a5"}, + {file = "docling_parse-4.5.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:e8b283a93860cdf43a93296e1721e25daeb8eede14417b9f188f0f52c010d6b5"}, + {file = "docling_parse-4.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:affdecc41ed18f1a82c56edac2b815535e3cc07e2b0f8ffaee7e4adfb1333f0e"}, + {file = "docling_parse-4.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da6e535463bcb19a64f3099bb73b299e1f6f49a1ef3b0b3ea4fa62e2790ad875"}, + {file = "docling_parse-4.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:dac5e9907cd6fd020bc1620082dacb9b99bfc9ee4001c55c4e4ce156edf3b617"}, + {file = "docling_parse-4.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f983d65703a165b76775c3e4b2a5cade4757216eb88faf5c0c86a9b33f38549a"}, + {file = "docling_parse-4.5.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:9d02c43d3185f5f4a6d5aaad38e69e07bbd1f965fd62f331bd9dfc006a637604"}, + {file = "docling_parse-4.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:9bf94bc213bedd6d880d94eface2285e9e344da5452a23b3a8d0fedecb5d3ec1"}, + {file = "docling_parse-4.5.0.tar.gz", hash = "sha256:e78f648c3a8af5ddb7dcc30c6c4270e9d3257366396a020ad60657de98bf88f5"}, +] + +[package.dependencies] +docling-core = ">=2.44.1" +pillow = ">=10.0.0,<12.0.0" +pydantic = ">=2.0.0" +pywin32 = {version 
= ">=305", markers = "sys_platform == \"win32\""} +tabulate = ">=0.9.0,<1.0.0" + +[package.extras] +perf-tools = ["pdfplumber (>=0.11.7)", "pymupdf (>=1.26.4)", "pypdfium2 (>=4.30.0)"] + [[package]] name = "docstring-parser" version = "0.17.0" @@ -2111,7 +2320,7 @@ description = "An implementation of lxml.xmlfile for the standard library" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, @@ -2184,21 +2393,20 @@ files = [ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] [[package]] -name = "falkordb" -version = "1.2.0" -description = "Python client for interacting with FalkorDB database" +name = "faker" +version = "37.11.0" +description = "Faker is a Python package that generates fake data for you." optional = true -python-versions = "<4.0,>=3.8" +python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"falkordb\"" +markers = "extra == \"docling\"" files = [ - {file = "falkordb-1.2.0-py3-none-any.whl", hash = "sha256:7572d9cc377735d22efc52fe6fe73c7a435422c827b6ea3ca223a850a77be12e"}, - {file = "falkordb-1.2.0.tar.gz", hash = "sha256:ce57365b86722d538e75aa5d438de67ecd8eb9478da612506d9812cd7f182d0b"}, + {file = "faker-37.11.0-py3-none-any.whl", hash = "sha256:1508d2da94dfd1e0087b36f386126d84f8583b3de19ac18e392a2831a6676c57"}, + {file = "faker-37.11.0.tar.gz", hash = "sha256:22969803849ba0618be8eee2dd01d0d9e2cd3b75e6ff1a291fa9abcdb34da5e6"}, ] [package.dependencies] -python-dateutil = ">=2.9.0,<3.0.0" -redis = ">=5.0.1,<6.0.0" +tzdata = "*" [[package]] name = "fastapi" @@ -2722,28 +2930,6 @@ files = [ {file = "giturlparse-0.12.0.tar.gz", hash = "sha256:c0fff7c21acc435491b1779566e038757a205c1ffdcb47e4f81ea52ad8c3859a"}, ] -[[package]] -name = "google-ai-generativelanguage" -version = "0.6.15" -description = "Google Ai Generativelanguage API client library" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"gemini\"" -files = [ - {file = "google_ai_generativelanguage-0.6.15-py3-none-any.whl", hash = "sha256:5a03ef86377aa184ffef3662ca28f19eeee158733e45d7947982eb953c6ebb6c"}, - {file = "google_ai_generativelanguage-0.6.15.tar.gz", hash = "sha256:8f6d9dc4c12b065fe2d0289026171acea5183ebf2d0b11cefe12f3821e159ec3"}, -] - -[package.dependencies] -google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} -google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" -proto-plus = [ - {version = ">=1.22.3,<2.0.0dev"}, - {version = ">=1.25.0,<2.0.0dev", markers = "python_version >= \"3.13\""}, -] -protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" - [[package]] name = "google-api-core" version = "2.25.1" @@ -2751,7 +2937,7 @@ description = "Google API client core library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\"" +markers = "extra == \"docs\"" files = [ {file = "google_api_core-2.25.1-py3-none-any.whl", hash = "sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7"}, {file = 
"google_api_core-2.25.1.tar.gz", hash = "sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8"}, @@ -2781,26 +2967,6 @@ grpc = ["grpcio (>=1.33.2,<2.0.0)", "grpcio (>=1.49.1,<2.0.0) ; python_version > grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"] grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.0)"] -[[package]] -name = "google-api-python-client" -version = "2.182.0" -description = "Google API Client Library for Python" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"gemini\"" -files = [ - {file = "google_api_python_client-2.182.0-py3-none-any.whl", hash = "sha256:a9b071036d41a17991d8fbf27bedb61f2888a39ae5696cb5a326bf999b2d5209"}, - {file = "google_api_python_client-2.182.0.tar.gz", hash = "sha256:cb2aa127e33c3a31e89a06f39cf9de982db90a98dee020911b21013afafad35f"}, -] - -[package.dependencies] -google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0" -google-auth = ">=1.32.0,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0" -google-auth-httplib2 = ">=0.2.0,<1.0.0" -httplib2 = ">=0.19.0,<1.0.0" -uritemplate = ">=3.0.1,<5" - [[package]] name = "google-auth" version = "2.40.3" @@ -2808,7 +2974,7 @@ description = "Google Authentication Library" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" +markers = "extra == \"deepeval\" or extra == \"chromadb\" or extra == \"docs\"" files = [ {file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"}, {file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"}, @@ -2829,23 +2995,6 @@ requests = ["requests (>=2.20.0,<3.0.0)"] testing = ["aiohttp (<3.10.0)", "aiohttp (>=3.6.2,<4.0.0)", "aioresponses", "cryptography (<39.0.0) ; python_version < \"3.8\"", "cryptography (>=38.0.3)", "flask", "freezegun", "grpcio", "mock", "oauth2client", "packaging", "pyjwt (>=2.0)", "pyopenssl (<24.3.0)", "pyopenssl (>=20.0.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-localserver", "pyu2f (>=0.1.5)", "requests (>=2.20.0,<3.0.0)", "responses", "urllib3"] urllib3 = ["packaging", "urllib3"] -[[package]] -name = "google-auth-httplib2" -version = "0.2.0" -description = "Google Authentication Library: httplib2 transport" -optional = true -python-versions = "*" -groups = ["main"] -markers = "extra == \"gemini\"" -files = [ - {file = "google-auth-httplib2-0.2.0.tar.gz", hash = "sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05"}, - {file = "google_auth_httplib2-0.2.0-py2.py3-none-any.whl", hash = "sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d"}, -] - -[package.dependencies] -google-auth = "*" -httplib2 = ">=0.19.0" - [[package]] name = "google-cloud-vision" version = "3.10.2" @@ -2895,31 +3044,6 @@ websockets = ">=13.0.0,<15.1.0" aiohttp = ["aiohttp (<4.0.0)"] local-tokenizer = ["protobuf", "sentencepiece (>=0.2.0)"] -[[package]] -name = "google-generativeai" -version = "0.8.5" -description = "Google Generative AI High level API client library and tools." 
-optional = true -python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"gemini\"" -files = [ - {file = "google_generativeai-0.8.5-py3-none-any.whl", hash = "sha256:22b420817fb263f8ed520b33285f45976d5b21e904da32b80d4fd20c055123a2"}, -] - -[package.dependencies] -google-ai-generativelanguage = "0.6.15" -google-api-core = "*" -google-api-python-client = "*" -google-auth = ">=2.15.0" -protobuf = "*" -pydantic = "*" -tqdm = "*" -typing-extensions = "*" - -[package.extras] -dev = ["Pillow", "absl-py", "black", "ipython", "nose2", "pandas", "pytype", "pyyaml"] - [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -2927,7 +3051,7 @@ description = "Common protobufs used in Google APIs" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" +markers = "extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" files = [ {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, @@ -2968,7 +3092,7 @@ description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\"" +markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\" or extra == \"scraping\"" files = [ {file = "greenlet-3.2.4-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8c68325b0d0acf8d91dde4e6f930967dd52a5302cd4062932a6b2e7c2969f47c"}, {file = "greenlet-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:94385f101946790ae13da500603491f04a76b6e4c059dab271b3ce2e283b2590"}, @@ -3077,7 +3201,7 @@ description = "HTTP/2-based RPC framework" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" +markers = "extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" files = [ {file = "grpcio-1.75.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:1ec9cbaec18d9597c718b1ed452e61748ac0b36ba350d558f9ded1a94cc15ec7"}, {file = "grpcio-1.75.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:7ee5ee42bfae8238b66a275f9ebcf6f295724375f2fa6f3b52188008b6380faf"}, @@ -3145,7 +3269,7 @@ description = "Status proto mapping for gRPC" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\"" +markers = "extra == \"docs\"" files = [ {file = "grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3"}, {file = "grpcio_status-1.71.2.tar.gz", hash = "sha256:c7a97e176df71cdc2c179cd1847d7fc86cca5832ad12e9798d7fed6b7a1aab50"}, @@ -3337,22 +3461,6 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] trio = ["trio (>=0.22.0,<1.0)"] -[[package]] -name = "httplib2" -version = "0.31.0" -description = "A comprehensive HTTP client library." 
-optional = true -python-versions = ">=3.6" -groups = ["main"] -markers = "extra == \"gemini\"" -files = [ - {file = "httplib2-0.31.0-py3-none-any.whl", hash = "sha256:b9cd78abea9b4e43a7714c6e0f8b6b8561a6fc1e95d5dbd367f5bf0ef35f5d24"}, - {file = "httplib2-0.31.0.tar.gz", hash = "sha256:ac7ab497c50975147d4f7b1ade44becc7df2f8954d42b38b3d69c515f531135c"}, -] - -[package.dependencies] -pyparsing = ">=3.0.4,<4" - [[package]] name = "httptools" version = "0.6.4" @@ -4020,6 +4128,22 @@ files = [ [package.extras] dev = ["build (==1.2.2.post1)", "coverage (==7.5.4) ; python_version < \"3.9\"", "coverage (==7.8.0) ; python_version >= \"3.9\"", "mypy (==1.14.1) ; python_version < \"3.9\"", "mypy (==1.15.0) ; python_version >= \"3.9\"", "pip (==25.0.1)", "pylint (==3.2.7) ; python_version < \"3.9\"", "pylint (==3.3.6) ; python_version >= \"3.9\"", "ruff (==0.11.2)", "twine (==6.1.0)", "uv (==0.6.11)"] +[[package]] +name = "jsonlines" +version = "3.1.0" +description = "Library with helpers for the jsonlines file format" +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "jsonlines-3.1.0-py3-none-any.whl", hash = "sha256:632f5e38f93dfcb1ac8c4e09780b92af3a55f38f26e7c47ae85109d420b6ad39"}, + {file = "jsonlines-3.1.0.tar.gz", hash = "sha256:2579cb488d96f815b0eb81629e3e6b0332da0962a18fa3532958f7ba14a5c37f"}, +] + +[package.dependencies] +attrs = ">=19.2.0" + [[package]] name = "jsonpatch" version = "1.33" @@ -4079,6 +4203,19 @@ files = [ {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, ] +[[package]] +name = "jsonref" +version = "1.1.0" +description = "jsonref is a library for automatic dereferencing of JSON Reference objects for Python." +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9"}, + {file = "jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552"}, +] + [[package]] name = "jsonschema" version = "4.25.1" @@ -4770,6 +4907,19 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"] nearley = ["js2py"] regex = ["regex"] +[[package]] +name = "latex2mathml" +version = "3.78.1" +description = "Pure Python library for LaTeX to MathML conversion" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "latex2mathml-3.78.1-py3-none-any.whl", hash = "sha256:f089b6d75e85b937f99693c93e8c16c0804008672c3dd2a3d25affd36f238100"}, + {file = "latex2mathml-3.78.1.tar.gz", hash = "sha256:f941db80bf41db33f31df87b304e8b588f8166b813b0257c11c98f7a9d0aac71"}, +] + [[package]] name = "limits" version = "4.8.0" @@ -4937,160 +5087,113 @@ dev = ["Sphinx (==8.1.3) ; python_version >= \"3.11\"", "build (==1.2.2) ; pytho [[package]] name = "lxml" -version = "6.0.2" +version = "4.9.4" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
optional = true -python-versions = ">=3.8" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"scraping\" or extra == \"docs\" or extra == \"docling\"" files = [ - {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388"}, - {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f952dacaa552f3bb8834908dddd500ba7d508e6ea6eb8c52eb2d28f48ca06a31"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:71695772df6acea9f3c0e59e44ba8ac50c4f125217e84aab21074a1a55e7e5c9"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f68764f35fd78d7c4cc4ef209a184c38b65440378013d24b8aecd327c3e0c8"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:058027e261afed589eddcfe530fcc6f3402d7fd7e89bfd0532df82ebc1563dba"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8ffaeec5dfea5881d4c9d8913a32d10cfe3923495386106e4a24d45300ef79c"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:f2e3b1a6bb38de0bc713edd4d612969dd250ca8b724be8d460001a387507021c"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d6690ec5ec1cce0385cb20896b16be35247ac8c2046e493d03232f1c2414d321"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2a50c3c1d11cad0ebebbac357a97b26aa79d2bcaf46f256551152aa85d3a4d1"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:3efe1b21c7801ffa29a1112fab3b0f643628c30472d507f39544fd48e9549e34"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:59c45e125140b2c4b33920d21d83681940ca29f0b83f8629ea1a2196dc8cfe6a"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:452b899faa64f1805943ec1c0c9ebeaece01a1af83e130b69cdefeda180bb42c"}, - {file = "lxml-6.0.2-cp310-cp310-win32.whl", hash = "sha256:1e786a464c191ca43b133906c6903a7e4d56bef376b75d97ccbb8ec5cf1f0a4b"}, - {file = "lxml-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:dacf3c64ef3f7440e3167aa4b49aa9e0fb99e0aa4f9ff03795640bf94531bcb0"}, - {file = "lxml-6.0.2-cp310-cp310-win_arm64.whl", hash = "sha256:45f93e6f75123f88d7f0cfd90f2d05f441b808562bf0bc01070a00f53f5028b5"}, - {file = "lxml-6.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:13e35cbc684aadf05d8711a5d1b5857c92e5e580efa9a0d2be197199c8def607"}, - {file = "lxml-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b1675e096e17c6fe9c0e8c81434f5736c0739ff9ac6123c87c2d452f48fc938"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac6e5811ae2870953390452e3476694196f98d447573234592d30488147404d"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5aa0fc67ae19d7a64c3fe725dc9a1bb11f80e01f78289d05c6f62545affec438"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de496365750cc472b4e7902a485d3f152ecf57bd3ba03ddd5578ed8ceb4c5964"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = 
"sha256:200069a593c5e40b8f6fc0d84d86d970ba43138c3e68619ffa234bc9bb806a4d"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d2de809c2ee3b888b59f995625385f74629707c9355e0ff856445cdcae682b7"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:b2c3da8d93cf5db60e8858c17684c47d01fee6405e554fb55018dd85fc23b178"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:442de7530296ef5e188373a1ea5789a46ce90c4847e597856570439621d9c553"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2593c77efde7bfea7f6389f1ab249b15ed4aa5bc5cb5131faa3b843c429fbedb"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:3e3cb08855967a20f553ff32d147e14329b3ae70ced6edc2f282b94afbc74b2a"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ed6c667fcbb8c19c6791bbf40b7268ef8ddf5a96940ba9404b9f9a304832f6c"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b8f18914faec94132e5b91e69d76a5c1d7b0c73e2489ea8929c4aaa10b76bbf7"}, - {file = "lxml-6.0.2-cp311-cp311-win32.whl", hash = "sha256:6605c604e6daa9e0d7f0a2137bdc47a2e93b59c60a65466353e37f8272f47c46"}, - {file = "lxml-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e5867f2651016a3afd8dd2c8238baa66f1e2802f44bc17e236f547ace6647078"}, - {file = "lxml-6.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:4197fb2534ee05fd3e7afaab5d8bfd6c2e186f65ea7f9cd6a82809c887bd1285"}, - {file = "lxml-6.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456"}, - {file = "lxml-6.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322"}, - {file = "lxml-6.0.2-cp312-cp312-win32.whl", hash = "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849"}, - {file = "lxml-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f"}, - {file = "lxml-6.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6"}, - {file = "lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77"}, - {file = "lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314"}, - {file = "lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2"}, - {file = "lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7"}, - {file = "lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf"}, - {file = "lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = 
"sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe"}, - {file = "lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c"}, - {file = "lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b"}, - {file = "lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed"}, - {file = "lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8"}, - {file = "lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d"}, - {file = "lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = 
"sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f"}, - {file = "lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312"}, - {file = "lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca"}, - {file = "lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c"}, - {file = "lxml-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a656ca105115f6b766bba324f23a67914d9c728dafec57638e2b92a9dcd76c62"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c54d83a2188a10ebdba573f16bd97135d06c9ef60c3dc495315c7a28c80a263f"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:1ea99340b3c729beea786f78c38f60f4795622f36e305d9c9be402201efdc3b7"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:af85529ae8d2a453feee4c780d9406a5e3b17cee0dd75c18bd31adcd584debc3"}, - {file = "lxml-6.0.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fe659f6b5d10fb5a17f00a50eb903eb277a71ee35df4615db573c069bcf967ac"}, - {file = "lxml-6.0.2-cp38-cp38-win32.whl", hash = "sha256:5921d924aa5468c939d95c9814fa9f9b5935a6ff4e679e26aaf2951f74043512"}, - {file = "lxml-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:0aa7070978f893954008ab73bb9e3c24a7c56c054e00566a21b553dc18105fca"}, - {file = "lxml-6.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2c8458c2cdd29589a8367c09c8f030f1d202be673f0ca224ec18590b3b9fb694"}, - {file = "lxml-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3fee0851639d06276e6b387f1c190eb9d7f06f7f53514e966b26bae46481ec90"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b2142a376b40b6736dfc214fd2902409e9e3857eff554fed2d3c60f097e62a62"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6b5b39cc7e2998f968f05309e666103b53e2edd01df8dc51b90d734c0825444"}, - {file = 
"lxml-6.0.2-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4aec24d6b72ee457ec665344a29acb2d35937d5192faebe429ea02633151aad"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:b42f4d86b451c2f9d06ffb4f8bbc776e04df3ba070b9fe2657804b1b40277c48"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6cdaefac66e8b8f30e37a9b4768a391e1f8a16a7526d5bc77a7928408ef68e93"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:b738f7e648735714bbb82bdfd030203360cfeab7f6e8a34772b3c8c8b820568c"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:daf42de090d59db025af61ce6bdb2521f0f102ea0e6ea310f13c17610a97da4c"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:66328dabea70b5ba7e53d94aa774b733cf66686535f3bc9250a7aab53a91caaf"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:e237b807d68a61fc3b1e845407e27e5eb8ef69bc93fe8505337c1acb4ee300b6"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:ac02dc29fd397608f8eb15ac1610ae2f2f0154b03f631e6d724d9e2ad4ee2c84"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:817ef43a0c0b4a77bd166dc9a09a555394105ff3374777ad41f453526e37f9cb"}, - {file = "lxml-6.0.2-cp39-cp39-win32.whl", hash = "sha256:bc532422ff26b304cfb62b328826bd995c96154ffd2bac4544f37dbb95ecaa8f"}, - {file = "lxml-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:995e783eb0374c120f528f807443ad5a83a656a8624c467ea73781fc5f8a8304"}, - {file = "lxml-6.0.2-cp39-cp39-win_arm64.whl", hash = "sha256:08b9d5e803c2e4725ae9e8559ee880e5328ed61aa0935244e0515d7d9dbec0aa"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e748d4cf8fef2526bb2a589a417eba0c8674e29ffcb570ce2ceca44f1e567bf6"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4ddb1049fa0579d0cbd00503ad8c58b9ab34d1254c77bc6a5576d96ec7853dba"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cb233f9c95f83707dae461b12b720c1af9c28c2d19208e1be03387222151daf5"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc456d04db0515ce3320d714a1eac7a97774ff0849e7718b492d957da4631dd4"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2613e67de13d619fd283d58bda40bff0ee07739f624ffee8b13b631abf33083d"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:24a8e756c982c001ca8d59e87c80c4d9dcd4d9b44a4cbeb8d9be4482c514d41d"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1c06035eafa8404b5cf475bb37a9f6088b0aca288d4ccc9d69389750d5543700"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7d13103045de1bdd6fe5d61802565f1a3537d70cd3abf596aa0af62761921ee"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a3c150a95fbe5ac91de323aa756219ef9cf7fde5a3f00e2281e30f33fa5fa4f"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60fa43be34f78bebb27812ed90f1925ec99560b0fa1decdb7d12b84d857d31e9"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:21c73b476d3cfe836be731225ec3421fa2f048d84f6df6a8e70433dff1376d5a"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e"}, - {file = "lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62"}, + {file = "lxml-4.9.4-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e214025e23db238805a600f1f37bf9f9a15413c7bf5f9d6ae194f84980c78722"}, + {file = "lxml-4.9.4-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ec53a09aee61d45e7dbe7e91252ff0491b6b5fee3d85b2d45b173d8ab453efc1"}, + {file = "lxml-4.9.4-cp27-cp27m-win32.whl", hash = "sha256:7d1d6c9e74c70ddf524e3c09d9dc0522aba9370708c2cb58680ea40174800013"}, + {file = "lxml-4.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:cb53669442895763e61df5c995f0e8361b61662f26c1b04ee82899c2789c8f69"}, + {file = "lxml-4.9.4-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:647bfe88b1997d7ae8d45dabc7c868d8cb0c8412a6e730a7651050b8c7289cf2"}, + {file = "lxml-4.9.4-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:4d973729ce04784906a19108054e1fd476bc85279a403ea1a72fdb051c76fa48"}, + {file = "lxml-4.9.4-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:056a17eaaf3da87a05523472ae84246f87ac2f29a53306466c22e60282e54ff8"}, + {file = "lxml-4.9.4-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:aaa5c173a26960fe67daa69aa93d6d6a1cd714a6eb13802d4e4bd1d24a530644"}, + {file = "lxml-4.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:647459b23594f370c1c01768edaa0ba0959afc39caeeb793b43158bb9bb6a663"}, + {file = "lxml-4.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:bdd9abccd0927673cffe601d2c6cdad1c9321bf3437a2f507d6b037ef91ea307"}, + {file = "lxml-4.9.4-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:00e91573183ad273e242db5585b52670eddf92bacad095ce25c1e682da14ed91"}, + {file = "lxml-4.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a602ed9bd2c7d85bd58592c28e101bd9ff9c718fbde06545a70945ffd5d11868"}, + {file = "lxml-4.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:de362ac8bc962408ad8fae28f3967ce1a262b5d63ab8cefb42662566737f1dc7"}, + {file = "lxml-4.9.4-cp310-cp310-win32.whl", hash = "sha256:33714fcf5af4ff7e70a49731a7cc8fd9ce910b9ac194f66eaa18c3cc0a4c02be"}, + {file = "lxml-4.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:d3caa09e613ece43ac292fbed513a4bce170681a447d25ffcbc1b647d45a39c5"}, + {file = "lxml-4.9.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:359a8b09d712df27849e0bcb62c6a3404e780b274b0b7e4c39a88826d1926c28"}, + {file = "lxml-4.9.4-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:43498ea734ccdfb92e1886dfedaebeb81178a241d39a79d5351ba2b671bff2b2"}, + {file = "lxml-4.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4855161013dfb2b762e02b3f4d4a21cc7c6aec13c69e3bffbf5022b3e708dd97"}, + {file = "lxml-4.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:c71b5b860c5215fdbaa56f715bc218e45a98477f816b46cfde4a84d25b13274e"}, + {file = "lxml-4.9.4-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9a2b5915c333e4364367140443b59f09feae42184459b913f0f41b9fed55794a"}, + {file = "lxml-4.9.4-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:d82411dbf4d3127b6cde7da0f9373e37ad3a43e89ef374965465928f01c2b979"}, + {file = "lxml-4.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:273473d34462ae6e97c0f4e517bd1bf9588aa67a1d47d93f760a1282640e24ac"}, + {file = "lxml-4.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:389d2b2e543b27962990ab529ac6720c3dded588cc6d0f6557eec153305a3622"}, + {file = "lxml-4.9.4-cp311-cp311-win32.whl", hash = "sha256:8aecb5a7f6f7f8fe9cac0bcadd39efaca8bbf8d1bf242e9f175cbe4c925116c3"}, + {file = "lxml-4.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:c7721a3ef41591341388bb2265395ce522aba52f969d33dacd822da8f018aff8"}, + {file = "lxml-4.9.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:dbcb2dc07308453db428a95a4d03259bd8caea97d7f0776842299f2d00c72fc8"}, + {file = "lxml-4.9.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:01bf1df1db327e748dcb152d17389cf6d0a8c5d533ef9bab781e9d5037619229"}, + {file = "lxml-4.9.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e8f9f93a23634cfafbad6e46ad7d09e0f4a25a2400e4a64b1b7b7c0fbaa06d9d"}, + {file = "lxml-4.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3f3f00a9061605725df1816f5713d10cd94636347ed651abdbc75828df302b20"}, + {file = "lxml-4.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:953dd5481bd6252bd480d6ec431f61d7d87fdcbbb71b0d2bdcfc6ae00bb6fb10"}, + {file = "lxml-4.9.4-cp312-cp312-win32.whl", hash = "sha256:266f655d1baff9c47b52f529b5f6bec33f66042f65f7c56adde3fcf2ed62ae8b"}, + {file = "lxml-4.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:f1faee2a831fe249e1bae9cbc68d3cd8a30f7e37851deee4d7962b17c410dd56"}, + {file = "lxml-4.9.4-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:23d891e5bdc12e2e506e7d225d6aa929e0a0368c9916c1fddefab88166e98b20"}, + {file = "lxml-4.9.4-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:e96a1788f24d03e8d61679f9881a883ecdf9c445a38f9ae3f3f193ab6c591c66"}, + {file = "lxml-4.9.4-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:5557461f83bb7cc718bc9ee1f7156d50e31747e5b38d79cf40f79ab1447afd2d"}, + {file = "lxml-4.9.4-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:fdb325b7fba1e2c40b9b1db407f85642e32404131c08480dd652110fc908561b"}, + {file = "lxml-4.9.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d74d4a3c4b8f7a1f676cedf8e84bcc57705a6d7925e6daef7a1e54ae543a197"}, + {file = "lxml-4.9.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ac7674d1638df129d9cb4503d20ffc3922bd463c865ef3cb412f2c926108e9a4"}, + {file = "lxml-4.9.4-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:ddd92e18b783aeb86ad2132d84a4b795fc5ec612e3545c1b687e7747e66e2b53"}, + {file = "lxml-4.9.4-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2bd9ac6e44f2db368ef8986f3989a4cad3de4cd55dbdda536e253000c801bcc7"}, + {file = "lxml-4.9.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:bc354b1393dce46026ab13075f77b30e40b61b1a53e852e99d3cc5dd1af4bc85"}, + {file = "lxml-4.9.4-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:f836f39678cb47c9541f04d8ed4545719dc31ad850bf1832d6b4171e30d65d23"}, + {file = "lxml-4.9.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:9c131447768ed7bc05a02553d939e7f0e807e533441901dd504e217b76307745"}, + {file = "lxml-4.9.4-cp36-cp36m-win32.whl", hash = "sha256:bafa65e3acae612a7799ada439bd202403414ebe23f52e5b17f6ffc2eb98c2be"}, + {file = "lxml-4.9.4-cp36-cp36m-win_amd64.whl", hash = 
"sha256:6197c3f3c0b960ad033b9b7d611db11285bb461fc6b802c1dd50d04ad715c225"}, + {file = "lxml-4.9.4-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:7b378847a09d6bd46047f5f3599cdc64fcb4cc5a5a2dd0a2af610361fbe77b16"}, + {file = "lxml-4.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:1343df4e2e6e51182aad12162b23b0a4b3fd77f17527a78c53f0f23573663545"}, + {file = "lxml-4.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6dbdacf5752fbd78ccdb434698230c4f0f95df7dd956d5f205b5ed6911a1367c"}, + {file = "lxml-4.9.4-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:506becdf2ecaebaf7f7995f776394fcc8bd8a78022772de66677c84fb02dd33d"}, + {file = "lxml-4.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ca8e44b5ba3edb682ea4e6185b49661fc22b230cf811b9c13963c9f982d1d964"}, + {file = "lxml-4.9.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:9d9d5726474cbbef279fd709008f91a49c4f758bec9c062dfbba88eab00e3ff9"}, + {file = "lxml-4.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:bbdd69e20fe2943b51e2841fc1e6a3c1de460d630f65bde12452d8c97209464d"}, + {file = "lxml-4.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8671622256a0859f5089cbe0ce4693c2af407bc053dcc99aadff7f5310b4aa02"}, + {file = "lxml-4.9.4-cp37-cp37m-win32.whl", hash = "sha256:dd4fda67f5faaef4f9ee5383435048ee3e11ad996901225ad7615bc92245bc8e"}, + {file = "lxml-4.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6bee9c2e501d835f91460b2c904bc359f8433e96799f5c2ff20feebd9bb1e590"}, + {file = "lxml-4.9.4-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:1f10f250430a4caf84115b1e0f23f3615566ca2369d1962f82bef40dd99cd81a"}, + {file = "lxml-4.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:3b505f2bbff50d261176e67be24e8909e54b5d9d08b12d4946344066d66b3e43"}, + {file = "lxml-4.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:1449f9451cd53e0fd0a7ec2ff5ede4686add13ac7a7bfa6988ff6d75cff3ebe2"}, + {file = "lxml-4.9.4-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:4ece9cca4cd1c8ba889bfa67eae7f21d0d1a2e715b4d5045395113361e8c533d"}, + {file = "lxml-4.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:59bb5979f9941c61e907ee571732219fa4774d5a18f3fa5ff2df963f5dfaa6bc"}, + {file = "lxml-4.9.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b1980dbcaad634fe78e710c8587383e6e3f61dbe146bcbfd13a9c8ab2d7b1192"}, + {file = "lxml-4.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9ae6c3363261021144121427b1552b29e7b59de9d6a75bf51e03bc072efb3c37"}, + {file = "lxml-4.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bcee502c649fa6351b44bb014b98c09cb00982a475a1912a9881ca28ab4f9cd9"}, + {file = "lxml-4.9.4-cp38-cp38-win32.whl", hash = "sha256:a8edae5253efa75c2fc79a90068fe540b197d1c7ab5803b800fccfe240eed33c"}, + {file = "lxml-4.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:701847a7aaefef121c5c0d855b2affa5f9bd45196ef00266724a80e439220e46"}, + {file = "lxml-4.9.4-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:f610d980e3fccf4394ab3806de6065682982f3d27c12d4ce3ee46a8183d64a6a"}, + {file = "lxml-4.9.4-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:aa9b5abd07f71b081a33115d9758ef6077924082055005808f68feccb27616bd"}, + {file = 
"lxml-4.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:365005e8b0718ea6d64b374423e870648ab47c3a905356ab6e5a5ff03962b9a9"}, + {file = "lxml-4.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:16b9ec51cc2feab009e800f2c6327338d6ee4e752c76e95a35c4465e80390ccd"}, + {file = "lxml-4.9.4-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a905affe76f1802edcac554e3ccf68188bea16546071d7583fb1b693f9cf756b"}, + {file = "lxml-4.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fd814847901df6e8de13ce69b84c31fc9b3fb591224d6762d0b256d510cbf382"}, + {file = "lxml-4.9.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:91bbf398ac8bb7d65a5a52127407c05f75a18d7015a270fdd94bbcb04e65d573"}, + {file = "lxml-4.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f99768232f036b4776ce419d3244a04fe83784bce871b16d2c2e984c7fcea847"}, + {file = "lxml-4.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bb5bd6212eb0edfd1e8f254585290ea1dadc3687dd8fd5e2fd9a87c31915cdab"}, + {file = "lxml-4.9.4-cp39-cp39-win32.whl", hash = "sha256:88f7c383071981c74ec1998ba9b437659e4fd02a3c4a4d3efc16774eb108d0ec"}, + {file = "lxml-4.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:936e8880cc00f839aa4173f94466a8406a96ddce814651075f95837316369899"}, + {file = "lxml-4.9.4-pp310-pypy310_pp73-macosx_11_0_x86_64.whl", hash = "sha256:f6c35b2f87c004270fa2e703b872fcc984d714d430b305145c39d53074e1ffe0"}, + {file = "lxml-4.9.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:606d445feeb0856c2b424405236a01c71af7c97e5fe42fbc778634faef2b47e4"}, + {file = "lxml-4.9.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a1bdcbebd4e13446a14de4dd1825f1e778e099f17f79718b4aeaf2403624b0f7"}, + {file = "lxml-4.9.4-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0a08c89b23117049ba171bf51d2f9c5f3abf507d65d016d6e0fa2f37e18c0fc5"}, + {file = "lxml-4.9.4-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:232fd30903d3123be4c435fb5159938c6225ee8607b635a4d3fca847003134ba"}, + {file = "lxml-4.9.4-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:231142459d32779b209aa4b4d460b175cadd604fed856f25c1571a9d78114771"}, + {file = "lxml-4.9.4-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:520486f27f1d4ce9654154b4494cf9307b495527f3a2908ad4cb48e4f7ed7ef7"}, + {file = "lxml-4.9.4-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:562778586949be7e0d7435fcb24aca4810913771f845d99145a6cee64d5b67ca"}, + {file = "lxml-4.9.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:a9e7c6d89c77bb2770c9491d988f26a4b161d05c8ca58f63fb1f1b6b9a74be45"}, + {file = "lxml-4.9.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:786d6b57026e7e04d184313c1359ac3d68002c33e4b1042ca58c362f1d09ff58"}, + {file = "lxml-4.9.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95ae6c5a196e2f239150aa4a479967351df7f44800c93e5a975ec726fef005e2"}, + {file = "lxml-4.9.4-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:9b556596c49fa1232b0fff4b0e69b9d4083a502e60e404b44341e2f8fb7187f5"}, + {file = "lxml-4.9.4-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:cc02c06e9e320869d7d1bd323df6dd4281e78ac2e7f8526835d3d48c69060683"}, + {file = 
"lxml-4.9.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:857d6565f9aa3464764c2cb6a2e3c2e75e1970e877c188f4aeae45954a314e0c"}, + {file = "lxml-4.9.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c42ae7e010d7d6bc51875d768110c10e8a59494855c3d4c348b068f5fb81fdcd"}, + {file = "lxml-4.9.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f10250bb190fb0742e3e1958dd5c100524c2cc5096c67c8da51233f7448dc137"}, + {file = "lxml-4.9.4.tar.gz", hash = "sha256:b1541e50b78e15fa06a2670157a1962ef06591d4c998b998047fff5e3236880e"}, ] [package.extras] cssselect = ["cssselect (>=0.7)"] -html-clean = ["lxml_html_clean"] html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] +source = ["Cython (==0.29.37)"] [[package]] name = "madoka" @@ -5177,6 +5280,24 @@ profiling = ["gprof2dot"] rtd = ["ipykernel", "jupyter_sphinx", "mdit-py-plugins (>=0.5.0)", "myst-parser", "pyyaml", "sphinx", "sphinx-book-theme (>=1.0,<2.0)", "sphinx-copybutton", "sphinx-design"] testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions", "requests"] +[[package]] +name = "marko" +version = "2.2.1" +description = "A markdown parser with high extensibility." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "marko-2.2.1-py3-none-any.whl", hash = "sha256:31e9a18b35c113e506ace5594716fa3df2872f8955908e279bc551f3eb1f0db8"}, + {file = "marko-2.2.1.tar.gz", hash = "sha256:e29d7e071a3b0cb2f7cc4c500d55f893dc5a45d85a8298dde6cb4e4dffd794d3"}, +] + +[package.extras] +codehilite = ["pygments"] +repr = ["objprint"] +toc = ["python-slugify"] + [[package]] name = "markupsafe" version = "3.0.2" @@ -5893,6 +6014,34 @@ files = [ {file = "monotonic-1.6.tar.gz", hash = "sha256:3a55207bcfed53ddd5c5bae174524062935efed17792e9de2ad0205ce9ad63f7"}, ] +[[package]] +name = "mpire" +version = "2.10.2" +description = "A Python package for easy multiprocessing, but faster than multiprocessing" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"}, + {file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"}, +] + +[package.dependencies] +multiprocess = [ + {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, + {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, +] +pygments = ">=2.0" +pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""} +tqdm = ">=4.27" + +[package.extras] +dashboard = ["flask"] +dill = ["multiprocess (>=0.70.15) ; python_version >= \"3.11\"", "multiprocess ; python_version < \"3.11\""] +docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"] +testing = ["ipywidgets", "multiprocess (>=0.70.15) ; python_version >= \"3.11\"", "multiprocess ; python_version < \"3.11\"", "numpy", "pywin32 (>=301) ; platform_system == \"Windows\"", "rich"] + [[package]] name = "mpmath" version = "1.3.0" @@ -6051,6 +6200,39 @@ files = [ [package.dependencies] typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} +[[package]] +name = "multiprocess" +version = "0.70.18" +description = "better multiprocessing and multithreading in Python" +optional = 
true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "multiprocess-0.70.18-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:25d4012dcaaf66b9e8e955f58482b42910c2ee526d532844d8bcf661bbc604df"}, + {file = "multiprocess-0.70.18-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:06b19433de0d02afe5869aec8931dd5c01d99074664f806c73896b0d9e527213"}, + {file = "multiprocess-0.70.18-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6fa1366f994373aaf2d4738b0f56e707caeaa05486e97a7f71ee0853823180c2"}, + {file = "multiprocess-0.70.18-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b8940ae30139e04b076da6c5b83e9398585ebdf0f2ad3250673fef5b2ff06d6"}, + {file = "multiprocess-0.70.18-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0929ba95831adb938edbd5fb801ac45e705ecad9d100b3e653946b7716cb6bd3"}, + {file = "multiprocess-0.70.18-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d77f8e4bfe6c6e2e661925bbf9aed4d5ade9a1c6502d5dfc10129b9d1141797"}, + {file = "multiprocess-0.70.18-pp38-pypy38_pp73-macosx_10_9_arm64.whl", hash = "sha256:2dbaae9bffa1fb2d58077c0044ffe87a8c8974e90fcf778cdf90e139c970d42a"}, + {file = "multiprocess-0.70.18-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bcac5a4e81f1554d98d1bba963eeb1bd24966432f04fcbd29b6e1a16251ad712"}, + {file = "multiprocess-0.70.18-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c0c7cd75d0987ab6166d64e654787c781dbacbcbcaaede4c1ffe664720b3e14b"}, + {file = "multiprocess-0.70.18-pp39-pypy39_pp73-macosx_10_13_arm64.whl", hash = "sha256:9fd8d662f7524a95a1be7cbea271f0b33089fe792baabec17d93103d368907da"}, + {file = "multiprocess-0.70.18-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:3fbba48bfcd932747c33f0b152b26207c4e0840c35cab359afaff7a8672b1031"}, + {file = "multiprocess-0.70.18-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5f9be0342e597dde86152c10442c5fb6c07994b1c29de441b7a3a08b0e6be2a0"}, + {file = "multiprocess-0.70.18-py310-none-any.whl", hash = "sha256:60c194974c31784019c1f459d984e8f33ee48f10fcf42c309ba97b30d9bd53ea"}, + {file = "multiprocess-0.70.18-py311-none-any.whl", hash = "sha256:5aa6eef98e691281b3ad923be2832bf1c55dd2c859acd73e5ec53a66aae06a1d"}, + {file = "multiprocess-0.70.18-py312-none-any.whl", hash = "sha256:9b78f8e5024b573730bfb654783a13800c2c0f2dfc0c25e70b40d184d64adaa2"}, + {file = "multiprocess-0.70.18-py313-none-any.whl", hash = "sha256:871743755f43ef57d7910a38433cfe41319e72be1bbd90b79c7a5ac523eb9334"}, + {file = "multiprocess-0.70.18-py38-none-any.whl", hash = "sha256:dbf705e52a154fe5e90fb17b38f02556169557c2dd8bb084f2e06c2784d8279b"}, + {file = "multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8"}, + {file = "multiprocess-0.70.18.tar.gz", hash = "sha256:f9597128e6b3e67b23956da07cf3d2e5cba79e2f4e0fba8d7903636663ec6d0d"}, +] + +[package.dependencies] +dill = ">=0.4.0" + [[package]] name = "mypy" version = "1.18.2" @@ -6536,7 +6718,7 @@ description = "CUBLAS native runtime libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = 
"sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0"}, {file = "nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142"}, @@ -6550,7 +6732,7 @@ description = "CUDA profiling tools runtime libs." optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed"}, {file = "nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182"}, @@ -6564,7 +6746,7 @@ description = "NVRTC native runtime libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994"}, {file = "nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8"}, @@ -6578,7 +6760,7 @@ description = "CUDA Runtime native Libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d"}, {file = "nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90"}, @@ -6592,7 +6774,7 @@ description = "cuDNN runtime libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8"}, {file = "nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8"}, @@ -6609,7 +6791,7 @@ description = "CUFFT native runtime libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a"}, {file = "nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74"}, @@ -6626,7 +6808,7 @@ description = "cuFile GPUDirect libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc"}, {file = "nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a"}, @@ -6639,7 +6821,7 @@ description = "CURAND native runtime libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd"}, {file = "nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9"}, @@ -6653,7 +6835,7 @@ description = "CUDA solver native runtime libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0"}, {file = "nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450"}, @@ -6672,7 +6854,7 @@ description = "CUSPARSE native runtime libraries" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc"}, {file = "nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b"}, @@ -6689,7 +6871,7 @@ description = "NVIDIA cuSPARSELt" optional = true python-versions = "*" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5"}, {file = 
"nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623"}, @@ -6703,7 +6885,7 @@ description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9ddf1a245abc36c550870f26d537a9b6087fb2e2e3d6e0ef03374c6fd19d984f"}, {file = "nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adf27ccf4238253e0b826bce3ff5fa532d65fc42322c8bfdfaf28024c0fbe039"}, @@ -6716,7 +6898,7 @@ description = "Nvidia JIT LTO Library" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88"}, {file = "nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7"}, @@ -6730,7 +6912,7 @@ description = "NVIDIA Tools Extension" optional = true python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615"}, {file = "nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f"}, @@ -6755,6 +6937,24 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "ocrmac" +version = "1.0.0" +description = "A python wrapper to extract text from images on a mac system. Uses the vision framework from Apple." 
+optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "sys_platform == \"darwin\" and extra == \"docling\"" +files = [ + {file = "ocrmac-1.0.0-py2.py3-none-any.whl", hash = "sha256:0b5a072aa23a9ead48132cb2d595b680aa6c3c5a6cb69525155e35ca95610c3a"}, + {file = "ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e"}, +] + +[package.dependencies] +Click = ">=7.0" +pillow = "*" +pyobjc-framework-Vision = "*" + [[package]] name = "olefile" version = "0.47" @@ -6795,7 +6995,7 @@ description = "A flexible configuration library" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docling\" or extra == \"docs\"" files = [ {file = "omegaconf-2.3.0-py3-none-any.whl", hash = "sha256:7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b"}, {file = "omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7"}, @@ -6930,7 +7130,7 @@ description = "Wrapper package for OpenCV python bindings." optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docling\" or extra == \"docs\"" files = [ {file = "opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4"}, {file = "opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a"}, @@ -6949,6 +7149,32 @@ numpy = [ {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] +[[package]] +name = "opencv-python-headless" +version = "4.11.0.86" +description = "Wrapper package for OpenCV python bindings." +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "opencv-python-headless-4.11.0.86.tar.gz", hash = "sha256:996eb282ca4b43ec6a3972414de0e2331f5d9cda2b41091a49739c19fb843798"}, + {file = "opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:48128188ade4a7e517237c8e1e11a9cdf5c282761473383e77beb875bb1e61ca"}, + {file = "opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81"}, + {file = "opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6efabcaa9df731f29e5ea9051776715b1bdd1845d7c9530065c7951d2a2899eb"}, + {file = "opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b"}, + {file = "opencv_python_headless-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b"}, + {file = "opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] + [[package]] name = "openpyxl" version = "3.1.5" @@ -6956,7 +7182,7 @@ description = "A Python library to read/write Excel 2010 xlsx/xlsm files" optional = true python-versions = ">=3.8" groups = 
["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, @@ -7314,7 +7540,7 @@ description = "Powerful data structures for data analysis, time series, and stat optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"docs\" or extra == \"evals\"" +markers = "extra == \"docs\" or extra == \"evals\" or extra == \"docling\"" files = [ {file = "pandas-2.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:52bc29a946304c360561974c6542d1dd628ddafa69134a7131fdfd6a5d7a1a35"}, {file = "pandas-2.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:220cc5c35ffaa764dd5bb17cf42df283b5cb7fdf49e10a7b053a06c9cb48ee2b"}, @@ -7887,6 +8113,29 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.4)", "pytest-cov (>=6)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.14.1)"] +[[package]] +name = "playwright" +version = "1.55.0" +description = "A high-level API to automate web browsers" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"scraping\"" +files = [ + {file = "playwright-1.55.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:d7da108a95001e412effca4f7610de79da1637ccdf670b1ae3fdc08b9694c034"}, + {file = "playwright-1.55.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:8290cf27a5d542e2682ac274da423941f879d07b001f6575a5a3a257b1d4ba1c"}, + {file = "playwright-1.55.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:25b0d6b3fd991c315cca33c802cf617d52980108ab8431e3e1d37b5de755c10e"}, + {file = "playwright-1.55.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:c6d4d8f6f8c66c483b0835569c7f0caa03230820af8e500c181c93509c92d831"}, + {file = "playwright-1.55.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29a0777c4ce1273acf90c87e4ae2fe0130182100d99bcd2ae5bf486093044838"}, + {file = "playwright-1.55.0-py3-none-win32.whl", hash = "sha256:29e6d1558ad9d5b5c19cbec0a72f6a2e35e6353cd9f262e22148685b86759f90"}, + {file = "playwright-1.55.0-py3-none-win_amd64.whl", hash = "sha256:7eb5956473ca1951abb51537e6a0da55257bb2e25fc37c2b75af094a5c93736c"}, + {file = "playwright-1.55.0-py3-none-win_arm64.whl", hash = "sha256:012dc89ccdcbd774cdde8aeee14c08e0dd52ddb9135bf10e9db040527386bd76"}, +] + +[package.dependencies] +greenlet = ">=3.1.1,<4.0.0" +pyee = ">=13,<14" + [[package]] name = "plotly" version = "6.3.0" @@ -7919,7 +8168,7 @@ description = "plugin and hook calling mechanisms for python" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\"" +markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\"" files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, @@ -7942,6 +8191,32 @@ files = [ {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, ] +[[package]] +name = "polyfactory" +version = "2.22.2" +description = "Mock data generation factories" +optional = true +python-versions = 
"<4.0,>=3.8" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "polyfactory-2.22.2-py3-none-any.whl", hash = "sha256:9bea58ac9a80375b4153cd60820f75e558b863e567e058794d28c6a52b84118a"}, + {file = "polyfactory-2.22.2.tar.gz", hash = "sha256:a3297aa0b004f2b26341e903795565ae88507c4d86e68b132c2622969028587a"}, +] + +[package.dependencies] +faker = ">=5.0.0" +typing-extensions = ">=4.6.0" + +[package.extras] +attrs = ["attrs (>=22.2.0)"] +beanie = ["beanie", "pydantic[email]", "pymongo (<4.9)"] +full = ["attrs", "beanie", "msgspec", "odmantic", "pydantic", "sqlalchemy"] +msgspec = ["msgspec"] +odmantic = ["odmantic (<1.0.0)", "pydantic[email]"] +pydantic = ["pydantic[email] (>=1.10)"] +sqlalchemy = ["sqlalchemy (>=1.4.29)"] + [[package]] name = "pondpond" version = "1.4.1" @@ -8165,6 +8440,19 @@ files = [ {file = "propcache-0.3.2.tar.gz", hash = "sha256:20d7d62e4e7ef05f221e0db2856b979540686342e7dd9973b815599c7057e168"}, ] +[[package]] +name = "protego" +version = "0.5.0" +description = "Pure-Python robots.txt parser with support for modern conventions" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"scraping\"" +files = [ + {file = "protego-0.5.0-py3-none-any.whl", hash = "sha256:4237227840a67fdeec289a9b89652455b5657806388c17e1a556e160435f8fc5"}, + {file = "protego-0.5.0.tar.gz", hash = "sha256:225dee0acfcc71de8c6f7cef9c618e5a9d3e7baa7ae1470b8d076a064033c463"}, +] + [[package]] name = "proto-plus" version = "1.26.1" @@ -8172,7 +8460,7 @@ description = "Beautiful, Pythonic protocol buffers" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\"" +markers = "extra == \"docs\"" files = [ {file = "proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66"}, {file = "proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012"}, @@ -8212,7 +8500,7 @@ description = "Cross-platform lib for process and system monitoring." 
optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"notebook\" or extra == \"dev\" or extra == \"docs\"" +markers = "extra == \"notebook\" or extra == \"dev\" or extra == \"docs\" or extra == \"docling\"" files = [ {file = "psutil-7.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:76168cef4397494250e9f4e73eb3752b146de1dd950040b29186d0cce1d5ca13"}, {file = "psutil-7.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:5d007560c8c372efdff9e4579c2846d71de737e4605f611437255e81efcca2c5"}, @@ -8244,6 +8532,7 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, + {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -8515,7 +8804,7 @@ description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" +markers = "extra == \"deepeval\" or extra == \"chromadb\" or extra == \"docs\"" files = [ {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, @@ -8528,7 +8817,7 @@ description = "A collection of ASN.1-based protocols modules" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" +markers = "extra == \"deepeval\" or extra == \"chromadb\" or extra == \"docs\"" files = [ {file = "pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a"}, {file = "pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6"}, @@ -8537,6 +8826,67 @@ files = [ [package.dependencies] pyasn1 = ">=0.6.1,<0.7.0" +[[package]] +name = "pyclipper" +version = "1.3.0.post6" +description = "Cython wrapper for the C++ translation of the Angus Johnson's Clipper library (ver. 
6.4.2)" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "pyclipper-1.3.0.post6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fa0f5e78cfa8262277bb3d0225537b3c2a90ef68fd90a229d5d24cf49955dcf4"}, + {file = "pyclipper-1.3.0.post6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a01f182d8938c1dc515e8508ed2442f7eebd2c25c7d5cb29281f583c1a8008a4"}, + {file = "pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:640f20975727994d4abacd07396f564e9e5665ba5cb66ceb36b300c281f84fa4"}, + {file = "pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63002f6bb0f1efa87c0b81634cbb571066f237067e23707dabf746306c92ba5"}, + {file = "pyclipper-1.3.0.post6-cp310-cp310-win32.whl", hash = "sha256:106b8622cd9fb07d80cbf9b1d752334c55839203bae962376a8c59087788af26"}, + {file = "pyclipper-1.3.0.post6-cp310-cp310-win_amd64.whl", hash = "sha256:9699e98862dadefd0bea2360c31fa61ca553c660cbf6fb44993acde1b959f58f"}, + {file = "pyclipper-1.3.0.post6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c4247e7c44b34c87acbf38f99d48fb1acaf5da4a2cf4dcd601a9b24d431be4ef"}, + {file = "pyclipper-1.3.0.post6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:851b3e58106c62a5534a1201295fe20c21714dee2eda68081b37ddb0367e6caa"}, + {file = "pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16cc1705a915896d2aff52131c427df02265631279eac849ebda766432714cc0"}, + {file = "pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace1f0753cf71c5c5f6488b8feef5dd0fa8b976ad86b24bb51f708f513df4aac"}, + {file = "pyclipper-1.3.0.post6-cp311-cp311-win32.whl", hash = "sha256:dbc828641667142751b1127fd5c4291663490cf05689c85be4c5bcc89aaa236a"}, + {file = "pyclipper-1.3.0.post6-cp311-cp311-win_amd64.whl", hash = "sha256:1c03f1ae43b18ee07730c3c774cc3cf88a10c12a4b097239b33365ec24a0a14a"}, + {file = "pyclipper-1.3.0.post6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:6363b9d79ba1b5d8f32d1623e797c1e9f994600943402e68d5266067bdde173e"}, + {file = "pyclipper-1.3.0.post6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:32cd7fb9c1c893eb87f82a072dbb5e26224ea7cebbad9dc306d67e1ac62dd229"}, + {file = "pyclipper-1.3.0.post6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3aab10e3c10ed8fa60c608fb87c040089b83325c937f98f06450cf9fcfdaf1d"}, + {file = "pyclipper-1.3.0.post6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58eae2ff92a8cae1331568df076c4c5775bf946afab0068b217f0cf8e188eb3c"}, + {file = "pyclipper-1.3.0.post6-cp312-cp312-win32.whl", hash = "sha256:793b0aa54b914257aa7dc76b793dd4dcfb3c84011d48df7e41ba02b571616eaf"}, + {file = "pyclipper-1.3.0.post6-cp312-cp312-win_amd64.whl", hash = "sha256:d3f9da96f83b8892504923beb21a481cd4516c19be1d39eb57a92ef1c9a29548"}, + {file = "pyclipper-1.3.0.post6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f129284d2c7bcd213d11c0f35e1ae506a1144ce4954e9d1734d63b120b0a1b58"}, + {file = "pyclipper-1.3.0.post6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:188fbfd1d30d02247f92c25ce856f5f3c75d841251f43367dbcf10935bc48f38"}, + {file = "pyclipper-1.3.0.post6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6d129d0c2587f2f5904d201a4021f859afbb45fada4261c9fdedb2205b09d23"}, + {file = "pyclipper-1.3.0.post6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash 
= "sha256:5c9c80b5c46eef38ba3f12dd818dc87f5f2a0853ba914b6f91b133232315f526"}, + {file = "pyclipper-1.3.0.post6-cp313-cp313-win32.whl", hash = "sha256:b15113ec4fc423b58e9ae80aa95cf5a0802f02d8f02a98a46af3d7d66ff0cc0e"}, + {file = "pyclipper-1.3.0.post6-cp313-cp313-win_amd64.whl", hash = "sha256:e5ff68fa770ac654c7974fc78792978796f068bd274e95930c0691c31e192889"}, + {file = "pyclipper-1.3.0.post6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:c92e41301a8f25f9adcd90954512038ed5f774a2b8c04a4a9db261b78ff75e3a"}, + {file = "pyclipper-1.3.0.post6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:04214d23cf79f4ddcde36e299dea9f23f07abb88fa47ef399bf0e819438bbefd"}, + {file = "pyclipper-1.3.0.post6-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:aa604f8665ade434f9eafcd23f89435057d5d09427dfb4554c5e6d19f6d8aa1a"}, + {file = "pyclipper-1.3.0.post6-cp36-cp36m-win32.whl", hash = "sha256:1fd56855ca92fa7eb0d8a71cf3a24b80b9724c8adcc89b385bbaa8924e620156"}, + {file = "pyclipper-1.3.0.post6-cp36-cp36m-win_amd64.whl", hash = "sha256:6893f9b701f3132d86018594d99b724200b937a3a3ddfe1be0432c4ff0284e6e"}, + {file = "pyclipper-1.3.0.post6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2737df106b8487103916147fe30f887aff439d9f2bd2f67c9d9b5c13eac88ccf"}, + {file = "pyclipper-1.3.0.post6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33ab72260f144693e1f7735e93276c3031e1ed243a207eff1f8b98c7162ba22c"}, + {file = "pyclipper-1.3.0.post6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:491ec1bfd2ee3013269c2b652dde14a85539480e0fb82f89bb12198fa59fff82"}, + {file = "pyclipper-1.3.0.post6-cp37-cp37m-win32.whl", hash = "sha256:2e257009030815853528ba4b2ef7fb7e172683a3f4255a63f00bde34cfab8b58"}, + {file = "pyclipper-1.3.0.post6-cp37-cp37m-win_amd64.whl", hash = "sha256:ed6e50c6e87ed190141573615d54118869bd63e9cd91ca5660d2ca926bf25110"}, + {file = "pyclipper-1.3.0.post6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:cf0a535cfa02b207435928e991c60389671fe1ea1dfae79170973f82f52335b2"}, + {file = "pyclipper-1.3.0.post6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:48dd55fbd55f63902cad511432ec332368cbbbc1dd2110c0c6c1e9edd735713a"}, + {file = "pyclipper-1.3.0.post6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05ae2ea878fdfa31dd375326f6191b03de98a9602cc9c2b6d4ff960b20a974c"}, + {file = "pyclipper-1.3.0.post6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:903176952a159c4195b8be55e597978e24804c838c7a9b12024c39704d341f72"}, + {file = "pyclipper-1.3.0.post6-cp38-cp38-win32.whl", hash = "sha256:fb1e52cf4ee0a9fa8b2254ed589cc51b0c989efc58fa8804289aca94a21253f7"}, + {file = "pyclipper-1.3.0.post6-cp38-cp38-win_amd64.whl", hash = "sha256:9cbdc517e75e647aa9bf6e356b3a3d2e3af344f82af38e36031eb46ba0ab5425"}, + {file = "pyclipper-1.3.0.post6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:383f3433b968f2e4b0843f338c1f63b85392b6e1d936de722e8c5d4f577dbff5"}, + {file = "pyclipper-1.3.0.post6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cf5ca2b9358d30a395ac6e14b3154a9fd1f9b557ad7153ea15cf697e88d07ce1"}, + {file = "pyclipper-1.3.0.post6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3404dfcb3415eee863564b5f49be28a8c7fb99ad5e31c986bcc33c8d47d97df7"}, + {file = "pyclipper-1.3.0.post6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:aa0e7268f8ceba218964bc3a482a5e9d32e352e8c3538b03f69a6b3db979078d"}, + {file = 
"pyclipper-1.3.0.post6-cp39-cp39-win32.whl", hash = "sha256:47a214f201ff930595a30649c2a063f78baa3a8f52e1f38da19f7930c90ed80c"}, + {file = "pyclipper-1.3.0.post6-cp39-cp39-win_amd64.whl", hash = "sha256:28bb590ae79e6beb15794eaee12b6f1d769589572d33e494faf5aa3b1f31b9fa"}, + {file = "pyclipper-1.3.0.post6-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3e5e65176506da6335f6cbab497ae1a29772064467fa69f66de6bab4b6304d34"}, + {file = "pyclipper-1.3.0.post6-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3d58202de8b8da4d1559afbda4e90a8c260a5373672b6d7bc5448c4614385144"}, + {file = "pyclipper-1.3.0.post6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2cd8600bd16d209d5d45a33b45c278e1cc8bedc169af1a1f2187b581c521395"}, + {file = "pyclipper-1.3.0.post6.tar.gz", hash = "sha256:42bff0102fa7a7f2abdd795a2594654d62b786d0c6cd67b72d469114fdeb608c"}, +] + [[package]] name = "pycocotools" version = "2.0.10" @@ -8800,6 +9150,25 @@ gcp-secret-manager = ["google-cloud-secret-manager (>=2.23.1)"] toml = ["tomli (>=2.0.1)"] yaml = ["pyyaml (>=6.0.1)"] +[[package]] +name = "pyee" +version = "13.0.0" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"scraping\"" +files = [ + {file = "pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498"}, + {file = "pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37"}, +] + +[package.dependencies] +typing-extensions = "*" + +[package.extras] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "mypy", "pytest", "pytest-asyncio ; python_version >= \"3.4\"", "pytest-trio ; python_version >= \"3.7\"", "sphinx", "toml", "tox", "trio", "trio ; python_version > \"3.6\"", "trio-typing ; python_version > \"3.6\"", "twine", "twisted", "validate-pyproject[all]"] + [[package]] name = "pyfiglet" version = "1.0.4" @@ -8876,6 +9245,18 @@ dev = ["pyright", "ruff (==0.4.1)"] tests = ["boto3", "datafusion (==49.0.0)", "datasets", "duckdb", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "psutil", "pytest", "tensorflow (<=2.19.0)", "tqdm"] torch = ["torch"] +[[package]] +name = "pylatexenc" +version = "2.10" +description = "Simple LaTeX parser providing latex-to-unicode and unicode-to-latex conversion" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3"}, +] + [[package]] name = "pylint" version = "3.3.8" @@ -8942,6 +9323,123 @@ files = [ [package.dependencies] pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""} +[[package]] +name = "pyobjc-core" +version = "11.1" +description = "Python<->ObjC Interoperability Module" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "sys_platform == \"darwin\" and extra == \"docling\"" +files = [ + {file = "pyobjc_core-11.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4c7536f3e94de0a3eae6bb382d75f1219280aa867cdf37beef39d9e7d580173c"}, + {file = "pyobjc_core-11.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ec36680b5c14e2f73d432b03ba7c1457dc6ca70fa59fd7daea1073f2b4157d33"}, + {file = 
"pyobjc_core-11.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:765b97dea6b87ec4612b3212258024d8496ea23517c95a1c5f0735f96b7fd529"}, + {file = "pyobjc_core-11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:18986f83998fbd5d3f56d8a8428b2f3e0754fd15cef3ef786ca0d29619024f2c"}, + {file = "pyobjc_core-11.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:8849e78cfe6595c4911fbba29683decfb0bf57a350aed8a43316976ba6f659d2"}, + {file = "pyobjc_core-11.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:8cb9ed17a8d84a312a6e8b665dd22393d48336ea1d8277e7ad20c19a38edf731"}, + {file = "pyobjc_core-11.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:f2455683e807f8541f0d83fbba0f5d9a46128ab0d5cc83ea208f0bec759b7f96"}, + {file = "pyobjc_core-11.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4a99e6558b48b8e47c092051e7b3be05df1c8d0617b62f6fa6a316c01902d157"}, + {file = "pyobjc_core-11.1.tar.gz", hash = "sha256:b63d4d90c5df7e762f34739b39cc55bc63dbcf9fb2fb3f2671e528488c7a87fe"}, +] + +[[package]] +name = "pyobjc-framework-cocoa" +version = "11.1" +description = "Wrappers for the Cocoa frameworks on macOS" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "sys_platform == \"darwin\" and extra == \"docling\"" +files = [ + {file = "pyobjc_framework_cocoa-11.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b27a5bdb3ab6cdeb998443ff3fce194ffae5f518c6a079b832dbafc4426937f9"}, + {file = "pyobjc_framework_cocoa-11.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7b9a9b8ba07f5bf84866399e3de2aa311ed1c34d5d2788a995bdbe82cc36cfa0"}, + {file = "pyobjc_framework_cocoa-11.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:806de56f06dfba8f301a244cce289d54877c36b4b19818e3b53150eb7c2424d0"}, + {file = "pyobjc_framework_cocoa-11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:54e93e1d9b0fc41c032582a6f0834befe1d418d73893968f3f450281b11603da"}, + {file = "pyobjc_framework_cocoa-11.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:fd5245ee1997d93e78b72703be1289d75d88ff6490af94462b564892e9266350"}, + {file = "pyobjc_framework_cocoa-11.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:aede53a1afc5433e1e7d66568cc52acceeb171b0a6005407a42e8e82580b4fc0"}, + {file = "pyobjc_framework_cocoa-11.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:1b5de4e1757bb65689d6dc1f8d8717de9ec8587eb0c4831c134f13aba29f9b71"}, + {file = "pyobjc_framework_cocoa-11.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bbee71eeb93b1b31ffbac8560b59a0524a8a4b90846a260d2c4f2188f3d4c721"}, + {file = "pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038"}, +] + +[package.dependencies] +pyobjc-core = ">=11.1" + +[[package]] +name = "pyobjc-framework-coreml" +version = "11.1" +description = "Wrappers for the framework CoreML on macOS" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "sys_platform == \"darwin\" and extra == \"docling\"" +files = [ + {file = "pyobjc_framework_coreml-11.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b1b1b849ca91e0d62ed6dfd200d95ca8d023d6edff854aae77ba54eb0542415f"}, + {file = "pyobjc_framework_coreml-11.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b5be7889ad99da1aca040238fd99af9ee87ea8a6628f24d33e2e4890b88dd139"}, + {file = "pyobjc_framework_coreml-11.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:c768b03d72488b964d753392e9c587684961d8237b69cca848b3a5a00aea79c9"}, + {file = 
"pyobjc_framework_coreml-11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:10d51f8a5fe8d30c7ec70304a2324df76b48b9fbef30ee0f0c33b99a49ae8853"}, + {file = "pyobjc_framework_coreml-11.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4df25ee233430f016ffcb4e88506b54c8e7b668c93197e6a1341761530a5922c"}, + {file = "pyobjc_framework_coreml-11.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:287a2a059016d02d8c40e0d29e70226142a4969db97ad79cefc70ec9bf0ab29e"}, + {file = "pyobjc_framework_coreml-11.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:a479c3d759aff3695f72c7915a78df6e92e0eca7027abaa8b4a07e876ba1dbfb"}, + {file = "pyobjc_framework_coreml-11.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25e6e2185aefc46eb2a796eee6f4bef1cba3206f914b85ac659699468e9dc9a8"}, + {file = "pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7"}, +] + +[package.dependencies] +pyobjc-core = ">=11.1" +pyobjc-framework-Cocoa = ">=11.1" + +[[package]] +name = "pyobjc-framework-quartz" +version = "11.1" +description = "Wrappers for the Quartz frameworks on macOS" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "sys_platform == \"darwin\" and extra == \"docling\"" +files = [ + {file = "pyobjc_framework_quartz-11.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b5ef75c416b0209e25b2eb07a27bd7eedf14a8c6b2f968711969d45ceceb0f84"}, + {file = "pyobjc_framework_quartz-11.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2d501fe95ef15d8acf587cb7dc4ab4be3c5a84e2252017da8dbb7df1bbe7a72a"}, + {file = "pyobjc_framework_quartz-11.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9ac806067541917d6119b98d90390a6944e7d9bd737f5c0a79884202327c9204"}, + {file = "pyobjc_framework_quartz-11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:43a1138280571bbf44df27a7eef519184b5c4183a588598ebaaeb887b9e73e76"}, + {file = "pyobjc_framework_quartz-11.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b23d81c30c564adf6336e00b357f355b35aad10075dd7e837cfd52a9912863e5"}, + {file = "pyobjc_framework_quartz-11.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:07cbda78b4a8fcf3a2d96e047a2ff01f44e3e1820f46f0f4b3b6d77ff6ece07c"}, + {file = "pyobjc_framework_quartz-11.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:39d02a3df4b5e3eee1e0da0fb150259476910d2a9aa638ab94153c24317a9561"}, + {file = "pyobjc_framework_quartz-11.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9b1f451ddb5243d8d6316af55f240a02b0fffbfe165bff325628bf73f3df7f44"}, + {file = "pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75"}, +] + +[package.dependencies] +pyobjc-core = ">=11.1" +pyobjc-framework-Cocoa = ">=11.1" + +[[package]] +name = "pyobjc-framework-vision" +version = "11.1" +description = "Wrappers for the framework Vision on macOS" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "sys_platform == \"darwin\" and extra == \"docling\"" +files = [ + {file = "pyobjc_framework_vision-11.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3c6f46df632096f070e16ba902a483fcb95c01fe12856a071bc2b25ac4a89bf3"}, + {file = "pyobjc_framework_vision-11.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bfbde43c9d4296e1d26548b6d30ae413e2029425968cd8bce96d3c5a735e8f2c"}, + {file = "pyobjc_framework_vision-11.1-cp312-cp312-macosx_10_13_universal2.whl", hash = 
"sha256:df076c3e3e672887182953efc934c1f9683304737e792ec09a29bfee90d2e26a"}, + {file = "pyobjc_framework_vision-11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1e5617e37dd2a7cff5e69e9aab039ea74b39ccdc528f6c828f2b60c1254e61e5"}, + {file = "pyobjc_framework_vision-11.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:dfd148a6df30ac70a9c41dd90a6c8f8c7f339bd9ca6829629a902f272e02b6b4"}, + {file = "pyobjc_framework_vision-11.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d1f8fdccc6135fdbfd66d8f21240d6c84465cb8e116a8e5b43601aed020051e5"}, + {file = "pyobjc_framework_vision-11.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:d00830c71a30fc893b3c5ee65119c7e5e5a95a16af53b8e56a0e58cff57e3b56"}, + {file = "pyobjc_framework_vision-11.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25d2d42edc7459b010ec87a0c5428d12fe5d62dfa95cd34fb71f716f2e4d6b95"}, + {file = "pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b"}, +] + +[package.dependencies] +pyobjc-core = ">=11.1" +pyobjc-framework-Cocoa = ">=11.1" +pyobjc-framework-CoreML = ">=11.1" +pyobjc-framework-Quartz = ">=11.1" + [[package]] name = "pypandoc" version = "1.15" @@ -9000,7 +9498,7 @@ description = "Python bindings to PDFium" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, @@ -9214,7 +9712,7 @@ description = "Create, read, and update Microsoft Word .docx files." optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7"}, {file = "python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce"}, @@ -9335,7 +9833,7 @@ description = "Create, read, and update PowerPoint 2007+ (.pptx) files." 
optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, @@ -9354,7 +9852,7 @@ description = "World timezone definitions, modern and historical" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"neo4j\" or extra == \"graphiti\" or extra == \"docs\" or extra == \"evals\" or extra == \"dlt\"" +markers = "extra == \"neo4j\" or extra == \"graphiti\" or extra == \"docs\" or extra == \"evals\" or extra == \"docling\" or extra == \"dlt\"" files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, @@ -9367,7 +9865,7 @@ description = "Python for Window Extensions" optional = false python-versions = "*" groups = ["main"] -markers = "(platform_system == \"Windows\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or platform_python_implementation != \"PyPy\" or extra == \"dlt\")" +markers = "(platform_system == \"Windows\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or platform_python_implementation != \"PyPy\" or extra == \"dlt\" or extra == \"docling\")" files = [ {file = "pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3"}, {file = "pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b"}, @@ -9700,6 +10198,31 @@ files = [ [package.extras] all = ["numpy"] +[[package]] +name = "rapidocr" +version = "3.4.2" +description = "Awesome OCR Library" +optional = true +python-versions = "<4,>=3.6" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "rapidocr-3.4.2-py3-none-any.whl", hash = "sha256:17845fa8cc9a20a935111e59482f2214598bba1547000cfd960d8924dd4522a5"}, +] + +[package.dependencies] +colorlog = "*" +numpy = ">=1.19.5,<3.0.0" +omegaconf = "*" +opencv-python = ">=4.5.1.48" +Pillow = "*" +pyclipper = ">=1.2.0" +PyYAML = "*" +requests = "*" +Shapely = ">=1.7.1,<2.0.4 || >2.0.4" +six = ">=1.15.0" +tqdm = "*" + [[package]] name = "rdflib" version = "7.1.4" @@ -9730,7 +10253,7 @@ description = "Python client for Redis database and key-value store" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"falkordb\"" +markers = "extra == \"redis\"" files = [ {file = "redis-5.3.1-py3-none-any.whl", hash = "sha256:dc1909bd24669cc31b5f67a039700b16ec30571096c5f1f0d9d2324bff31af97"}, {file = "redis-5.3.1.tar.gz", hash = "sha256:ca49577a531ea64039b5a36db3d6cd1a0c7a60c34124d46924a45b956e8cf14c"}, @@ -10216,7 +10739,7 @@ description = "Pure-Python RSA implementation" optional = true python-versions = "<4,>=3.6" groups = ["main"] -markers = "extra == \"gemini\" or extra == \"docs\" or extra == \"deepeval\" or extra == \"chromadb\"" +markers = "extra == \"deepeval\" or extra == \"chromadb\" or extra == \"docs\"" files = [ {file = "rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762"}, {file = "rsa-4.9.1.tar.gz", hash = 
"sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75"}, @@ -10225,6 +10748,26 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "rtree" +version = "1.4.1" +description = "R-Tree spatial index for Python GIS" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "rtree-1.4.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d672184298527522d4914d8ae53bf76982b86ca420b0acde9298a7a87d81d4a4"}, + {file = "rtree-1.4.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a7e48d805e12011c2cf739a29d6a60ae852fb1de9fc84220bbcef67e6e595d7d"}, + {file = "rtree-1.4.1-py3-none-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:efa8c4496e31e9ad58ff6c7df89abceac7022d906cb64a3e18e4fceae6b77f65"}, + {file = "rtree-1.4.1-py3-none-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12de4578f1b3381a93a655846900be4e3d5f4cd5e306b8b00aa77c1121dc7e8c"}, + {file = "rtree-1.4.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b558edda52eca3e6d1ee629042192c65e6b7f2c150d6d6cd207ce82f85be3967"}, + {file = "rtree-1.4.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f155bc8d6bac9dcd383481dee8c130947a4866db1d16cb6dff442329a038a0dc"}, + {file = "rtree-1.4.1-py3-none-win_amd64.whl", hash = "sha256:efe125f416fd27150197ab8521158662943a40f87acab8028a1aac4ad667a489"}, + {file = "rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0"}, + {file = "rtree-1.4.1.tar.gz", hash = "sha256:c6b1b3550881e57ebe530cc6cffefc87cd9bf49c30b37b894065a9f810875e46"}, +] + [[package]] name = "ruff" version = "0.13.1" @@ -10306,7 +10849,7 @@ description = "" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"huggingface\" or extra == \"ollama\" or extra == \"codegraph\" or extra == \"docs\"" +markers = "extra == \"huggingface\" or extra == \"ollama\" or extra == \"codegraph\" or extra == \"docling\" or extra == \"docs\"" files = [ {file = "safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba"}, {file = "safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b"}, @@ -10325,6 +10868,10 @@ files = [ {file = "safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9"}, ] +[package.dependencies] +numpy = {version = ">=1.21.6", optional = true, markers = "extra == \"numpy\""} +torch = {version = ">=1.10", optional = true, markers = "extra == \"torch\""} + [package.extras] all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"] dev = ["safetensors[all]"] @@ -10403,7 +10950,7 @@ description = "Fundamental algorithms for scientific computing in Python" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version == \"3.10\" and (extra == \"docs\" or extra == \"evals\")" +markers = "python_version == \"3.10\" and (extra == \"docs\" or extra == \"evals\" or extra == \"docling\")" files = [ {file = "scipy-1.15.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c"}, {file = "scipy-1.15.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253"}, @@ -10468,7 
+11015,7 @@ description = "Fundamental algorithms for scientific computing in Python" optional = true python-versions = ">=3.11" groups = ["main"] -markers = "python_version >= \"3.11\" and (extra == \"docs\" or extra == \"evals\")" +markers = "python_version >= \"3.11\" and (extra == \"docs\" or extra == \"evals\" or extra == \"docling\")" files = [ {file = "scipy-1.16.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:6ab88ea43a57da1af33292ebd04b417e8e2eaf9d5aa05700be8d6e1b6501cd92"}, {file = "scipy-1.16.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c95e96c7305c96ede73a7389f46ccd6c659c4da5ef1b2789466baeaed3622b6e"}, @@ -10541,6 +11088,23 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodest doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"] test = ["Cython", "array-api-strict (>=2.3.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja ; sys_platform != \"emscripten\"", "pooch", "pytest (>=8.0.0)", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +[[package]] +name = "semchunk" +version = "2.2.2" +description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "semchunk-2.2.2-py3-none-any.whl", hash = "sha256:94ca19020c013c073abdfd06d79a7c13637b91738335f3b8cdb5655ee7cc94d2"}, + {file = "semchunk-2.2.2.tar.gz", hash = "sha256:940e89896e64eeb01de97ba60f51c8c7b96c6a3951dfcf574f25ce2146752f52"}, +] + +[package.dependencies] +mpire = {version = "*", extras = ["dill"]} +tqdm = "*" + [[package]] name = "semver" version = "3.0.4" @@ -10639,7 +11203,7 @@ description = "Easily download, build, install, upgrade, and uninstall Python pa optional = true python-versions = ">=3.9" groups = ["main"] -markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\") or python_version == \"3.12\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\") or python_full_version == \"3.13.0\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\") or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\") or python_version == \"3.12\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or python_full_version == \"3.13.0\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == 
\"dlt\"" files = [ {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, @@ -10654,6 +11218,81 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] +[[package]] +name = "shapely" +version = "2.1.2" +description = "Manipulation and analysis of geometric objects" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"docling\"" +files = [ + {file = "shapely-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7ae48c236c0324b4e139bea88a306a04ca630f49be66741b340729d380d8f52f"}, + {file = "shapely-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eba6710407f1daa8e7602c347dfc94adc02205ec27ed956346190d66579eb9ea"}, + {file = "shapely-2.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ef4a456cc8b7b3d50ccec29642aa4aeda959e9da2fe9540a92754770d5f0cf1f"}, + {file = "shapely-2.1.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e38a190442aacc67ff9f75ce60aec04893041f16f97d242209106d502486a142"}, + {file = "shapely-2.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:40d784101f5d06a1fd30b55fc11ea58a61be23f930d934d86f19a180909908a4"}, + {file = "shapely-2.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f6f6cd5819c50d9bcf921882784586aab34a4bd53e7553e175dece6db513a6f0"}, + {file = "shapely-2.1.2-cp310-cp310-win32.whl", hash = "sha256:fe9627c39c59e553c90f5bc3128252cb85dc3b3be8189710666d2f8bc3a5503e"}, + {file = "shapely-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:1d0bfb4b8f661b3b4ec3565fa36c340bfb1cda82087199711f86a88647d26b2f"}, + {file = "shapely-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91121757b0a36c9aac3427a651a7e6567110a4a67c97edf04f8d55d4765f6618"}, + {file = "shapely-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16a9c722ba774cf50b5d4541242b4cce05aafd44a015290c82ba8a16931ff63d"}, + {file = "shapely-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cc4f7397459b12c0b196c9efe1f9d7e92463cbba142632b4cc6d8bbbbd3e2b09"}, + {file = "shapely-2.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:136ab87b17e733e22f0961504d05e77e7be8c9b5a8184f685b4a91a84efe3c26"}, + {file = "shapely-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:16c5d0fc45d3aa0a69074979f4f1928ca2734fb2e0dde8af9611e134e46774e7"}, + {file = "shapely-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ddc759f72b5b2b0f54a7e7cde44acef680a55019eb52ac63a7af2cf17cb9cd2"}, + {file = "shapely-2.1.2-cp311-cp311-win32.whl", hash = "sha256:2fa78b49485391224755a856ed3b3bd91c8455f6121fee0db0e71cefb07d0ef6"}, + {file = "shapely-2.1.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:c64d5c97b2f47e3cd9b712eaced3b061f2b71234b3fc263e0fcf7d889c6559dc"}, + {file = "shapely-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fe2533caae6a91a543dec62e8360fe86ffcdc42a7c55f9dfd0128a977a896b94"}, + {file = "shapely-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ba4d1333cc0bc94381d6d4308d2e4e008e0bd128bdcff5573199742ee3634359"}, + {file = "shapely-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0bd308103340030feef6c111d3eb98d50dc13feea33affc8a6f9fa549e9458a3"}, + {file = "shapely-2.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1e7d4d7ad262a48bb44277ca12c7c78cb1b0f56b32c10734ec9a1d30c0b0c54b"}, + {file = "shapely-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e9eddfe513096a71896441a7c37db72da0687b34752c4e193577a145c71736fc"}, + {file = "shapely-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:980c777c612514c0cf99bc8a9de6d286f5e186dcaf9091252fcd444e5638193d"}, + {file = "shapely-2.1.2-cp312-cp312-win32.whl", hash = "sha256:9111274b88e4d7b54a95218e243282709b330ef52b7b86bc6aaf4f805306f454"}, + {file = "shapely-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:743044b4cfb34f9a67205cee9279feaf60ba7d02e69febc2afc609047cb49179"}, + {file = "shapely-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b510dda1a3672d6879beb319bc7c5fd302c6c354584690973c838f46ec3e0fa8"}, + {file = "shapely-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8cff473e81017594d20ec55d86b54bc635544897e13a7cfc12e36909c5309a2a"}, + {file = "shapely-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe7b77dc63d707c09726b7908f575fc04ff1d1ad0f3fb92aec212396bc6cfe5e"}, + {file = "shapely-2.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7ed1a5bbfb386ee8332713bf7508bc24e32d24b74fc9a7b9f8529a55db9f4ee6"}, + {file = "shapely-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a84e0582858d841d54355246ddfcbd1fce3179f185da7470f41ce39d001ee1af"}, + {file = "shapely-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dc3487447a43d42adcdf52d7ac73804f2312cbfa5d433a7d2c506dcab0033dfd"}, + {file = "shapely-2.1.2-cp313-cp313-win32.whl", hash = "sha256:9c3a3c648aedc9f99c09263b39f2d8252f199cb3ac154fadc173283d7d111350"}, + {file = "shapely-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:ca2591bff6645c216695bdf1614fca9c82ea1144d4a7591a466fef64f28f0715"}, + {file = "shapely-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2d93d23bdd2ed9dc157b46bc2f19b7da143ca8714464249bef6771c679d5ff40"}, + {file = "shapely-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:01d0d304b25634d60bd7cf291828119ab55a3bab87dc4af1e44b07fb225f188b"}, + {file = "shapely-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8d8382dd120d64b03698b7298b89611a6ea6f55ada9d39942838b79c9bc89801"}, + {file = "shapely-2.1.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:19efa3611eef966e776183e338b2d7ea43569ae99ab34f8d17c2c054d3205cc0"}, + {file = "shapely-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:346ec0c1a0fcd32f57f00e4134d1200e14bf3f5ae12af87ba83ca275c502498c"}, + {file = "shapely-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6305993a35989391bd3476ee538a5c9a845861462327efe00dd11a5c8c709a99"}, + {file = "shapely-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:c8876673449f3401f278c86eb33224c5764582f72b653a415d0e6672fde887bf"}, + {file = 
"shapely-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:4a44bc62a10d84c11a7a3d7c1c4fe857f7477c3506e24c9062da0db0ae0c449c"}, + {file = "shapely-2.1.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:9a522f460d28e2bf4e12396240a5fc1518788b2fcd73535166d748399ef0c223"}, + {file = "shapely-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1ff629e00818033b8d71139565527ced7d776c269a49bd78c9df84e8f852190c"}, + {file = "shapely-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f67b34271dedc3c653eba4e3d7111aa421d5be9b4c4c7d38d30907f796cb30df"}, + {file = "shapely-2.1.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:21952dc00df38a2c28375659b07a3979d22641aeb104751e769c3ee825aadecf"}, + {file = "shapely-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1f2f33f486777456586948e333a56ae21f35ae273be99255a191f5c1fa302eb4"}, + {file = "shapely-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cf831a13e0d5a7eb519e96f58ec26e049b1fad411fc6fc23b162a7ce04d9cffc"}, + {file = "shapely-2.1.2-cp314-cp314-win32.whl", hash = "sha256:61edcd8d0d17dd99075d320a1dd39c0cb9616f7572f10ef91b4b5b00c4aeb566"}, + {file = "shapely-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:a444e7afccdb0999e203b976adb37ea633725333e5b119ad40b1ca291ecf311c"}, + {file = "shapely-2.1.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:5ebe3f84c6112ad3d4632b1fd2290665aa75d4cef5f6c5d77c4c95b324527c6a"}, + {file = "shapely-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5860eb9f00a1d49ebb14e881f5caf6c2cf472c7fd38bd7f253bbd34f934eb076"}, + {file = "shapely-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b705c99c76695702656327b819c9660768ec33f5ce01fa32b2af62b56ba400a1"}, + {file = "shapely-2.1.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a1fd0ea855b2cf7c9cddaf25543e914dd75af9de08785f20ca3085f2c9ca60b0"}, + {file = "shapely-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:df90e2db118c3671a0754f38e36802db75fe0920d211a27481daf50a711fdf26"}, + {file = "shapely-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:361b6d45030b4ac64ddd0a26046906c8202eb60d0f9f53085f5179f1d23021a0"}, + {file = "shapely-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:b54df60f1fbdecc8ebc2c5b11870461a6417b3d617f555e5033f1505d36e5735"}, + {file = "shapely-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:0036ac886e0923417932c2e6369b6c52e38e0ff5d9120b90eef5cd9a5fc5cae9"}, + {file = "shapely-2.1.2.tar.gz", hash = "sha256:2ed4ecb28320a433db18a5bf029986aa8afcfd740745e78847e330d5d94922a9"}, +] + +[package.dependencies] +numpy = ">=1.21" + +[package.extras] +docs = ["matplotlib", "numpydoc (==1.1.*)", "sphinx", "sphinx-book-theme", "sphinx-remove-toctrees"] +test = ["pytest", "pytest-cov", "scipy-doctest"] + [[package]] name = "shellingham" version = "1.5.4" @@ -10851,7 +11490,7 @@ description = "A modern CSS selector implementation for Beautiful Soup." 
optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"notebook\" or extra == \"dev\" or extra == \"docs\" or extra == \"evals\"" +markers = "extra == \"scraping\" or extra == \"notebook\" or extra == \"dev\" or extra == \"docs\" or extra == \"evals\" or extra == \"docling\"" files = [ {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, @@ -11067,7 +11706,7 @@ description = "Pretty-print tabular data" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"deepeval\"" +markers = "extra == \"deepeval\" or extra == \"docling\"" files = [ {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, @@ -11076,6 +11715,24 @@ files = [ [package.extras] widechars = ["wcwidth"] +[[package]] +name = "tavily-python" +version = "0.7.12" +description = "Python wrapper for the Tavily API" +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"scraping\"" +files = [ + {file = "tavily_python-0.7.12-py3-none-any.whl", hash = "sha256:00d09b9de3ca02ef9a994cf4e7ae43d4ec9d199f0566ba6e52cbfcbd07349bd1"}, + {file = "tavily_python-0.7.12.tar.gz", hash = "sha256:661945bbc9284cdfbe70fb50de3951fd656bfd72e38e352481d333a36ae91f5a"}, +] + +[package.dependencies] +httpx = "*" +requests = "*" +tiktoken = ">=0.5.1" + [[package]] name = "tenacity" version = "9.0.0" @@ -11325,7 +11982,7 @@ description = "Tensors and Dynamic neural networks in Python with strong GPU acc optional = true python-versions = ">=3.9.0" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "torch-2.8.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:0be92c08b44009d4131d1ff7a8060d10bafdb7ddcb7359ef8d8c5169007ea905"}, {file = "torch-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:89aa9ee820bb39d4d72b794345cccef106b574508dd17dbec457949678c76011"}, @@ -11389,7 +12046,7 @@ description = "image and video datasets and models for torch deep learning" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "torchvision-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7266871daca00ad46d1c073e55d972179d12a58fa5c9adec9a3db9bbed71284a"}, {file = "torchvision-0.23.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:31c583ba27426a3a04eca8c05450524105c1564db41be6632f7536ef405a6de2"}, @@ -11494,7 +12151,7 @@ description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow optional = true python-versions = ">=3.9.0" groups = ["main"] -markers = "extra == \"huggingface\" or extra == \"ollama\" or extra == \"codegraph\" or extra == \"docs\"" +markers = "extra == \"huggingface\" or extra == \"ollama\" or extra == \"codegraph\" or extra == \"docling\" or extra == \"docs\"" files = [ {file = "transformers-4.56.2-py3-none-any.whl", hash = "sha256:79c03d0e85b26cb573c109ff9eafa96f3c8d4febfd8a0774e8bba32702dd6dde"}, {file = "transformers-4.56.2.tar.gz", hash = "sha256:5e7c623e2d7494105c726dd10f6f90c2c99a55ebe86eef7233765abd0cb1c529"}, @@ -11636,7 +12293,7 @@ description = "A language and compiler for 
custom Deep Learning operations" optional = true python-versions = "<3.14,>=3.9" groups = ["main"] -markers = "platform_machine == \"x86_64\" and extra == \"docs\" and platform_system == \"Linux\"" +markers = "platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\") and platform_system == \"Linux\"" files = [ {file = "triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ff2785de9bc02f500e085420273bb5cc9c9bb767584a4aa28d6e360cec70128"}, {file = "triton-3.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b70f5e6a41e52e48cfc087436c8a28c17ff98db369447bcaff3b887a3ab4467"}, @@ -11785,12 +12442,31 @@ description = "Provider of IANA time zone data" optional = true python-versions = ">=2" groups = ["main"] -markers = "extra == \"docs\" or extra == \"evals\" or extra == \"dlt\"" +markers = "(platform_system == \"Windows\" or extra == \"docs\" or extra == \"evals\" or extra == \"docling\" or extra == \"dlt\") and (extra == \"scraping\" or extra == \"docs\" or extra == \"evals\" or extra == \"docling\" or extra == \"dlt\")" files = [ {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, ] +[[package]] +name = "tzlocal" +version = "5.3.1" +description = "tzinfo object for the local timezone" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"scraping\"" +files = [ + {file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"}, + {file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"}, +] + +[package.dependencies] +tzdata = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"] + [[package]] name = "unstructured" version = "0.18.15" @@ -11970,19 +12646,6 @@ files = [ [package.extras] dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake8-commas", "flake8-comprehensions", "flake8-continuation", "flake8-datetimez", "flake8-docstrings", "flake8-import-order", "flake8-literal", "flake8-modern-annotations", "flake8-noqa", "flake8-pyproject", "flake8-requirements", "flake8-typechecking-import", "flake8-use-fstring", "mypy", "pep8-naming", "types-PyYAML"] -[[package]] -name = "uritemplate" -version = "4.2.0" -description = "Implementation of RFC 6570 URI Templates" -optional = true -python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"gemini\"" -files = [ - {file = "uritemplate-4.2.0-py3-none-any.whl", hash = "sha256:962201ba1c4edcab02e60f9a0d3821e82dfc5d2d6662a21abd533879bdb8a686"}, - {file = "uritemplate-4.2.0.tar.gz", hash = "sha256:480c2ed180878955863323eea31b0ede668795de182617fef9c6ca09e6ec9d0e"}, -] - [[package]] name = "urllib3" version = "2.5.0" @@ -12576,7 +13239,7 @@ description = "A Python module for creating Excel XLSX files." 
optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"docs\"" +markers = "extra == \"docs\" or extra == \"docling\"" files = [ {file = "xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3"}, {file = "xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c"}, @@ -12846,10 +13509,9 @@ deepeval = ["deepeval"] dev = ["coverage", "deptry", "gitpython", "mkdocs-material", "mkdocs-minify-plugin", "mkdocstrings", "mypy", "notebook", "pre-commit", "pylint", "pytest", "pytest-asyncio", "pytest-cov", "ruff", "tweepy"] distributed = ["modal"] dlt = ["dlt"] +docling = ["docling", "transformers"] docs = ["unstructured"] evals = ["gdown", "matplotlib", "pandas", "plotly", "scikit-learn"] -falkordb = ["falkordb"] -gemini = ["google-generativeai"] graphiti = ["graphiti-core"] groq = ["groq"] huggingface = ["transformers"] @@ -12864,8 +13526,10 @@ ollama = ["transformers"] postgres = ["asyncpg", "pgvector", "psycopg2"] postgres-binary = ["asyncpg", "pgvector", "psycopg2-binary"] posthog = ["posthog"] +redis = ["redis"] +scraping = ["APScheduler", "beautifulsoup4", "lxml", "playwright", "protego", "tavily-python"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<=3.13" -content-hash = "f56ce018c96211b8a67d74b4b53c3d333dd6aa964d4be4f9844db8710d130144" +content-hash = "8d8172ac8ddc3c30ca79a1677ecf2a28897d52c0a564d8fb5646c8565c313a0f" diff --git a/pyproject.toml b/pyproject.toml index c8f71514b..30889a61e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -140,6 +140,7 @@ dev = [ "mkdocstrings[python]>=0.26.2,<0.27", ] debug = ["debugpy>=1.8.9,<2.0.0"] +redis = ["redis>=5.0.3,<6.0.0"] monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"] diff --git a/uv.lock b/uv.lock index 8bab2a8e5..4f94bf098 100644 --- a/uv.lock +++ b/uv.lock @@ -1024,6 +1024,9 @@ postgres-binary = [ posthog = [ { name = "posthog" }, ] +redis = [ + { name = "redis" }, +] scraping = [ { name = "apscheduler" }, { name = "beautifulsoup4" }, @@ -1114,6 +1117,7 @@ requires-dist = [ { name = "python-magic-bin", marker = "sys_platform == 'win32'", specifier = "<0.5" }, { name = "python-multipart", specifier = ">=0.0.20,<1.0.0" }, { name = "rdflib", specifier = ">=7.1.4,<7.2.0" }, + { name = "redis", marker = "extra == 'redis'", specifier = ">=5.0.3,<6.0.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.2,<=0.13.1" }, { name = "s3fs", extras = ["boto3"], marker = "extra == 'aws'", specifier = "==2025.3.2" }, { name = "scikit-learn", marker = "extra == 'evals'", specifier = ">=1.6.1,<2" }, @@ -1134,7 +1138,7 @@ requires-dist = [ { name = "uvicorn", specifier = ">=0.34.0,<1.0.0" }, { name = "websockets", specifier = ">=15.0.1,<16.0.0" }, ] -provides-extras = ["api", "distributed", "scraping", "neo4j", "neptune", "postgres", "postgres-binary", "notebook", "langchain", "llama-index", "huggingface", "ollama", "mistral", "anthropic", "deepeval", "posthog", "groq", "chromadb", "docs", "codegraph", "evals", "graphiti", "aws", "dlt", "baml", "dev", "debug", "monitoring", "docling"] +provides-extras = ["api", "distributed", "scraping", "neo4j", "neptune", "postgres", "postgres-binary", "notebook", "langchain", "llama-index", "huggingface", "ollama", "mistral", "anthropic", "deepeval", "posthog", "groq", "chromadb", "docs", "codegraph", "evals", "graphiti", "aws", "dlt", "baml", "dev", "debug", "redis", "monitoring", "docling"] [[package]] name = "colorama" @@ -7389,6 
+7393,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/31/e9b6f04288dcd3fa60cb3179260d6dad81b92aef3063d679ac7d80a827ea/rdflib-7.1.4-py3-none-any.whl", hash = "sha256:72f4adb1990fa5241abd22ddaf36d7cafa5d91d9ff2ba13f3086d339b213d997", size = 565051, upload-time = "2025-03-29T02:22:44.987Z" }, ] +[[package]] +name = "redis" +version = "5.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "async-timeout", marker = "python_full_version < '3.11.3'" }, + { name = "pyjwt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/cf/128b1b6d7086200c9f387bd4be9b2572a30b90745ef078bd8b235042dc9f/redis-5.3.1.tar.gz", hash = "sha256:ca49577a531ea64039b5a36db3d6cd1a0c7a60c34124d46924a45b956e8cf14c", size = 4626200, upload-time = "2025-07-25T08:06:27.778Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/26/5c5fa0e83c3621db835cfc1f1d789b37e7fa99ed54423b5f519beb931aa7/redis-5.3.1-py3-none-any.whl", hash = "sha256:dc1909bd24669cc31b5f67a039700b16ec30571096c5f1f0d9d2324bff31af97", size = 272833, upload-time = "2025-07-25T08:06:26.317Z" }, +] + [[package]] name = "referencing" version = "0.36.2" diff --git a/working_dir_error_replication/run_subprocess_test.py b/working_dir_error_replication/run_subprocess_test.py new file mode 100644 index 000000000..b97154a91 --- /dev/null +++ b/working_dir_error_replication/run_subprocess_test.py @@ -0,0 +1,31 @@ +""" +Run writer and reader in separate subprocesses to test Kuzu locks. +""" + +import subprocess +import time +import os + + +def main(): + print("=== Kuzu Subprocess Lock Test ===") + print("Starting writer and reader in separate subprocesses...") + print("Writer will hold the database lock, reader should block or fail\n") + + start_time = time.time() + + # Start writer subprocess + writer_process = subprocess.Popen([os.sys.executable, "writer.py"]) + + reader_process = subprocess.Popen([os.sys.executable, "reader.py"]) + + # Wait for both processes to complete + writer_process.wait() + reader_process.wait() + + total_time = time.time() - start_time + print(f"\nTotal execution time: {total_time:.2f}s") + + +if __name__ == "__main__": + main() From 6a693d319add7eaf7875599e2f965c0da5d43096 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 15:45:21 +0100 Subject: [PATCH 40/61] fix: preferred_loaders is always None in `data_item_to_text_file.load_file()` --- .../tasks/ingestion/data_item_to_text_file.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 9fcafca57..dc0d1d0a7 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -48,17 +48,17 @@ async def data_item_to_text_file( await pull_from_s3(data_item_path, temp_file) temp_file.flush() # Data needs to be saved to local storage loader = get_loader_engine() - return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader( - temp_file.name, preferred_loaders - ) + return await loader.load_file( + temp_file.name, None, preferred_loaders + ), loader.get_loader(temp_file.name, preferred_loaders) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, 
None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") @@ -69,9 +69,9 @@ async def data_item_to_text_file( # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path) if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") From 2998802c00961e36115bed93f5eda446e8500c75 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 17 Oct 2025 11:58:14 +0200 Subject: [PATCH 41/61] fix: Resolve issue with wrong error for OpenAI --- .../litellm_instructor/llm/openai/adapter.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py index 8877c2bdf..305b426b8 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py @@ -156,10 +156,7 @@ class OpenAIAdapter(LLMInterface): InstructorRetryException, ) as e: if not (self.fallback_model and self.fallback_api_key): - raise ContentPolicyFilterError( - f"The provided input contains content that is not aligned with our content policy: {text_input}" - ) from e - + raise e try: return await self.aclient.chat.completions.create( model=self.fallback_model, From 3ee50c192f0b3469858e5caf4992e8cfd8901d36 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 12:01:06 +0100 Subject: [PATCH 42/61] refactor emptiness check to be boolean, and optimize query --- cognee/api/v1/search/search.py | 10 ++++------ .../databases/graph/graph_db_interface.py | 6 +++--- .../databases/graph/kuzu/adapter.py | 7 ++++--- .../databases/graph/neo4j_driver/adapter.py | 7 ++++--- cognee/modules/data/exceptions/__init__.py | 1 - cognee/modules/data/exceptions/exceptions.py | 10 ---------- cognee/tests/test_kuzu.py | 16 ++++++++-------- cognee/tests/test_neo4j.py | 16 ++++++++-------- 8 files changed, 31 insertions(+), 42 deletions(-) diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 880a57b99..0caca619a 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -8,7 +8,7 @@ from cognee.modules.search.types import SearchResult, SearchType, CombinedSearch from cognee.modules.users.methods import get_default_user from cognee.modules.search.methods import search as search_function from cognee.modules.data.methods import get_authorized_existing_datasets -from cognee.modules.data.exceptions import DatasetNotFoundError, SearchOnEmptyGraphError +from cognee.modules.data.exceptions import DatasetNotFoundError async def search( @@ -177,12 +177,10 @@ async def search( raise DatasetNotFoundError(message="No datasets found.") graph_engine = await get_graph_engine() - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - if nodes_count == 0: - raise SearchOnEmptyGraphError( - message="Knowledge graph is empty, please ensure data is added and cognified." 
- ) + if is_empty: + return [] filtered_search_results = await search_function( query_text=query_text, diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index d7542eac6..67df1a27c 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -160,9 +160,9 @@ class GraphDBInterface(ABC): """ @abstractmethod - async def count_nodes(self) -> int: - logger.warning("count_nodes is not implemented") - return 1 # dummy value to not fail search() + async def is_empty(self) -> bool: + logger.warning("is_empty() is not implemented") + return True @abstractmethod async def query(self, query: str, params: dict) -> List[Any]: diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index 04c163efa..29ff92247 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -185,13 +185,14 @@ class KuzuAdapter(GraphDBInterface): except FileNotFoundError: logger.warning(f"Kuzu S3 storage file not found: {self.db_path}") - async def count_nodes(self) -> int: + async def is_empty(self) -> bool: query = """ MATCH (n) - RETURN COUNT(n); + RETURN true + LIMIT 1; """ query_result = await self.query(query) - return query_result[0][0] + return len(query_result) == 0 async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]: """ diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index ac19069f4..5861b69cb 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -87,13 +87,14 @@ class Neo4jAdapter(GraphDBInterface): async with self.driver.session(database=self.graph_database_name) as session: yield session - async def count_nodes(self) -> int: + async def is_empty(self) -> bool: query = """ + RETURN EXISTS { MATCH (n) - RETURN COUNT(n) as total_nodes; + } AS node_exists; """ query_result = await self.query(query) - return query_result[0]["total_nodes"] + return not query_result[0]["node_exists"] @deadlock_retry() async def query( diff --git a/cognee/modules/data/exceptions/__init__.py b/cognee/modules/data/exceptions/__init__.py index ba943634d..54af81070 100644 --- a/cognee/modules/data/exceptions/__init__.py +++ b/cognee/modules/data/exceptions/__init__.py @@ -9,5 +9,4 @@ from .exceptions import ( UnauthorizedDataAccessError, DatasetNotFoundError, DatasetTypeError, - SearchOnEmptyGraphError, ) diff --git a/cognee/modules/data/exceptions/exceptions.py b/cognee/modules/data/exceptions/exceptions.py index c2921750a..ac3b68e64 100644 --- a/cognee/modules/data/exceptions/exceptions.py +++ b/cognee/modules/data/exceptions/exceptions.py @@ -35,16 +35,6 @@ class DatasetNotFoundError(CogneeValidationError): super().__init__(message, name, status_code) -class SearchOnEmptyGraphError(CogneeValidationError): - def __init__( - self, - message: str = "Knowledge graph is empty, please ensure data is added and cognified.", - name: str = "SearchOnEmptyGraphError", - status_code=status.HTTP_400_BAD_REQUEST, - ): - super().__init__(message, name, status_code) - - class DatasetTypeError(CogneeValidationError): def __init__( self, diff --git a/cognee/tests/test_kuzu.py b/cognee/tests/test_kuzu.py index c07a51104..fe9da6dcb 100644 --- 
a/cognee/tests/test_kuzu.py +++ b/cognee/tests/test_kuzu.py @@ -51,21 +51,21 @@ async def main(): graph_engine = await get_graph_engine() - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - assert nodes_count == 0, "Kuzu graph database is not empty" + assert is_empty, "Kuzu graph database is not empty" await cognee.add([explanation_file_path_quantum], dataset_name) - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - assert nodes_count == 0, "Kuzu graph database should be empty before cognify" + assert is_empty, "Kuzu graph database should be empty before cognify" await cognee.cognify([dataset_name]) - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - assert nodes_count != 0, "Kuzu graph database should not be empty" + assert not is_empty, "Kuzu graph database should not be empty" from cognee.infrastructure.databases.vector import get_vector_engine @@ -131,9 +131,9 @@ async def main(): await cognee.prune.prune_system(metadata=True) - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - assert nodes_count == 0, "Kuzu graph database is not empty" + assert is_empty, "Kuzu graph database is not empty" finally: # Ensure cleanup even if tests fail diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 6f1fcf975..925614e67 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -39,9 +39,9 @@ async def main(): graph_engine = await get_graph_engine() - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - assert nodes_count == 0, "Graph has to be empty" + assert is_empty, "Graph has to be empty" await cognee.add([explanation_file_path_nlp], dataset_name) @@ -50,15 +50,15 @@ async def main(): ) await cognee.add([explanation_file_path_quantum], dataset_name) - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - assert nodes_count == 0, "Graph has to be empty before cognify" + assert is_empty, "Graph has to be empty before cognify" await cognee.cognify([dataset_name]) - nodes_count = await graph_engine.count_nodes() + is_empty = await graph_engine.is_empty() - assert nodes_count != 0, "Graph shouldn't be empty" + assert not is_empty, "Graph shouldn't be empty" from cognee.infrastructure.databases.vector import get_vector_engine @@ -132,8 +132,8 @@ async def main(): assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) - nodes_count = await graph_engine.count_nodes() - assert nodes_count == 0, "Neo4j graph database is not empty" + is_empty = await graph_engine.is_empty() + assert is_empty, "Neo4j graph database is not empty" if __name__ == "__main__": From c313fcd02924eff3a08a8129b3b3b14f93f67ca0 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 12:06:35 +0100 Subject: [PATCH 43/61] log warning on attempts to search on an empty knowledge graph --- cognee/api/v1/search/search.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 0caca619a..9f158e9d0 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -9,6 +9,9 @@ from cognee.modules.users.methods import get_default_user from cognee.modules.search.methods import search as search_function from cognee.modules.data.methods import get_authorized_existing_datasets from 
cognee.modules.data.exceptions import DatasetNotFoundError +from cognee.shared.logging_utils import get_logger + +logger = get_logger() async def search( @@ -180,6 +183,7 @@ async def search( is_empty = await graph_engine.is_empty() if is_empty: + logger.warning("Search attempt on an empty knowledge graph") return [] filtered_search_results = await search_function( From 4e2a7778600bcea3992dbec4466939022e9b53c8 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 14:18:44 +0100 Subject: [PATCH 44/61] tests: update tests after last refactoring --- cognee/tests/unit/api/test_search.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cognee/tests/unit/api/test_search.py b/cognee/tests/unit/api/test_search.py index aff9e5d38..54a4cc35f 100644 --- a/cognee/tests/unit/api/test_search.py +++ b/cognee/tests/unit/api/test_search.py @@ -1,6 +1,5 @@ import pytest import cognee -from cognee.modules.data.exceptions import SearchOnEmptyGraphError @pytest.mark.asyncio @@ -8,16 +7,15 @@ async def test_empty_search_raises_SearchOnEmptyGraphError_on_empty_graph(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) await cognee.add("Sample input") - with pytest.raises(SearchOnEmptyGraphError): - await cognee.search("Sample query") + result = await cognee.search("Sample query") + assert result == [] +@pytest.mark.asyncio async def test_empty_search_doesnt_raise_SearchOnEmptyGraphError(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) await cognee.add("Sample input") await cognee.cognify() - try: - await cognee.search("Sample query") - except SearchOnEmptyGraphError: - pytest.fail("Should not raise SearchOnEmptyGraphError when data was added and cognified") + result = await cognee.search("Sample query") + assert result != [] From 50aa8aac115f8fcf4011e1001e86adf9afc89594 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 17:33:25 +0100 Subject: [PATCH 45/61] refactor: remove `filestream` arg from `LoaderEngine.load_file(...)` --- cognee/infrastructure/loaders/LoaderEngine.py | 1 - .../tasks/ingestion/data_item_to_text_file.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 87a008660..6b62f7641 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -105,7 +105,6 @@ class LoaderEngine: async def load_file( self, file_path: str, - file_stream: Optional[Any], preferred_loaders: Optional[List[str]] = None, **kwargs, ): diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index dc0d1d0a7..9fcafca57 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -48,17 +48,17 @@ async def data_item_to_text_file( await pull_from_s3(data_item_path, temp_file) temp_file.flush() # Data needs to be saved to local storage loader = get_loader_engine() - return await loader.load_file( - temp_file.name, None, preferred_loaders - ), loader.get_loader(temp_file.name, preferred_loaders) + return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader( + temp_file.name, preferred_loaders + ) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file( - data_item_path, None, 
preferred_loaders - ), loader.get_loader(data_item_path, preferred_loaders) + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) else: raise IngestionError(message="Local files are not accepted.") @@ -69,9 +69,9 @@ async def data_item_to_text_file( # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path) if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file( - data_item_path, None, preferred_loaders - ), loader.get_loader(data_item_path, preferred_loaders) + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) else: raise IngestionError(message="Local files are not accepted.") From 3f7efd8b888829d3e89a8120e4345782495ed3af Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 13:33:02 +0200 Subject: [PATCH 46/61] added fixes for tests --- .github/workflows/test_different_operating_systems.yml | 2 +- .github/workflows/test_suites.yml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_different_operating_systems.yml b/.github/workflows/test_different_operating_systems.yml index 00e387ac4..64f1a14f9 100644 --- a/.github/workflows/test_different_operating_systems.yml +++ b/.github/workflows/test_different_operating_systems.yml @@ -9,7 +9,7 @@ on: python-versions: required: false type: string - default: '["3.10.x", "3.11.x", "3.12.x"]' + default: '["3.10.x", "3.12.x", "3.13.x"]' secrets: LLM_PROVIDER: required: true diff --git a/.github/workflows/test_suites.yml b/.github/workflows/test_suites.yml index 2f1bdebf0..5c1597a93 100644 --- a/.github/workflows/test_suites.yml +++ b/.github/workflows/test_suites.yml @@ -85,7 +85,7 @@ jobs: needs: [basic-tests, e2e-tests] uses: ./.github/workflows/test_different_operating_systems.yml with: - python-versions: '["3.10.x", "3.11.x", "3.12.x"]' + python-versions: '["3.10.x", "3.11.x", "3.12.x", "3.13.x"]' secrets: inherit # Matrix-based vector database tests diff --git a/pyproject.toml b/pyproject.toml index 30889a61e..417786e90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ authors = [ { name = "Vasilije Markovic" }, { name = "Boris Arzentar" }, ] -requires-python = ">=3.10,<=3.13" +requires-python = ">=3.10,<3.14" readme = "README.md" license = "Apache-2.0" classifiers = [ From 66876daf8581ef27d6fad1c50c17628f9a3f5d03 Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 14:38:34 +0200 Subject: [PATCH 47/61] removed docs --- .github/actions/cognee_setup/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/cognee_setup/action.yml b/.github/actions/cognee_setup/action.yml index 4017d524b..06e5bae6b 100644 --- a/.github/actions/cognee_setup/action.yml +++ b/.github/actions/cognee_setup/action.yml @@ -41,4 +41,4 @@ runs: EXTRA_ARGS="$EXTRA_ARGS --extra $extra" done fi - uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS + uv sync --extra api --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS From a1927548adf0ba4197251d8008daef19cfc4030b Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 14:52:02 +0200 Subject: [PATCH 48/61] added --- .github/actions/cognee_setup/action.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.github/actions/cognee_setup/action.yml b/.github/actions/cognee_setup/action.yml index 06e5bae6b..4017d524b 100644 --- a/.github/actions/cognee_setup/action.yml +++ b/.github/actions/cognee_setup/action.yml @@ -41,4 +41,4 @@ runs: EXTRA_ARGS="$EXTRA_ARGS --extra $extra" done fi - uv sync --extra api --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS + uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS diff --git a/pyproject.toml b/pyproject.toml index 417786e90..390028a6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ chromadb = [ "chromadb>=0.6,<0.7", "pypika==0.48.9", ] -docs = ["unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] +docs = ["lxml=6.0.2, unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] codegraph = [ "fastembed<=0.6.0 ; python_version < '3.13'", "transformers>=4.46.3,<5", From 0c62916e75fac2281a6152ed84a74d476cb11437 Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 14:54:00 +0200 Subject: [PATCH 49/61] added --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 390028a6c..0f3c8c287 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ chromadb = [ "chromadb>=0.6,<0.7", "pypika==0.48.9", ] -docs = ["lxml=6.0.2, unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] +docs = ["lxml==6.0.2, unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] codegraph = [ "fastembed<=0.6.0 ; python_version < '3.13'", "transformers>=4.46.3,<5", From 8900b31decbac106ccb4b985c7d90590ad4d87ff Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 14:57:40 +0200 Subject: [PATCH 50/61] added --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0f3c8c287..461aee301 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ chromadb = [ "chromadb>=0.6,<0.7", "pypika==0.48.9", ] -docs = ["lxml==6.0.2, unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] +docs = ["lxml==6.0.2", "unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] codegraph = [ "fastembed<=0.6.0 ; python_version < '3.13'", "transformers>=4.46.3,<5", From aa577d438444fd0e82f892c7b23f5ca2b04c5a65 Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 15:02:53 +0200 Subject: [PATCH 51/61] added --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 461aee301..dae648f80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,10 +65,10 @@ distributed = [ "modal>=1.0.5,<2.0.0", ] scraping = [ - "tavily-python>=0.7.0", + "tavily-python>=0.7.12", "beautifulsoup4>=4.13.1", "playwright>=1.9.0", - "lxml>=4.9.3,<5.0.0", + "lxml>=4.9.3", "protego>=0.1", "APScheduler>=3.10.0,<=3.11.0" ] From 86ec2e9685aabbb0d54fc45a99c0ac131e3a89c4 Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 15:06:38 +0200 Subject: [PATCH 52/61] added --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index dae648f80..d4d8d535d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ chromadb = [ 
"chromadb>=0.6,<0.7", "pypika==0.48.9", ] -docs = ["lxml==6.0.2", "unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] +docs = ["lxml<6.0.0", "unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] codegraph = [ "fastembed<=0.6.0 ; python_version < '3.13'", "transformers>=4.46.3,<5", From cbfa360b8f7726c1eec9bfd97d8297f2024664e3 Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 15:26:06 +0200 Subject: [PATCH 53/61] added lock file --- poetry.lock | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 62ae7be8d..c974a0b43 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "accelerate" @@ -6633,7 +6633,7 @@ description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.11" groups = ["main"] -markers = "python_version == \"3.12\" or python_full_version == \"3.13.0\"" +markers = "python_version >= \"3.12\"" files = [ {file = "numpy-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ffc4f5caba7dfcbe944ed674b7eef683c7e94874046454bb79ed7ee0236f59d"}, {file = "numpy-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7e946c7170858a0295f79a60214424caac2ffdb0063d4d79cb681f9aa0aa569"}, @@ -8532,7 +8532,6 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, - {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -11203,7 +11202,7 @@ description = "Easily download, build, install, upgrade, and uninstall Python pa optional = true python-versions = ">=3.9" groups = ["main"] -markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\") or python_version == \"3.12\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or python_full_version == \"3.13.0\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\" or extra == \"notebook\" or 
extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\") or python_version >= \"3.12\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\"" files = [ {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, @@ -13510,7 +13509,7 @@ dev = ["coverage", "deptry", "gitpython", "mkdocs-material", "mkdocs-minify-plug distributed = ["modal"] dlt = ["dlt"] docling = ["docling", "transformers"] -docs = ["unstructured"] +docs = ["lxml", "unstructured"] evals = ["gdown", "matplotlib", "pandas", "plotly", "scikit-learn"] graphiti = ["graphiti-core"] groq = ["groq"] @@ -13531,5 +13530,5 @@ scraping = ["APScheduler", "beautifulsoup4", "lxml", "playwright", "protego", "t [metadata] lock-version = "2.1" -python-versions = ">=3.10,<=3.13" -content-hash = "8d8172ac8ddc3c30ca79a1677ecf2a28897d52c0a564d8fb5646c8565c313a0f" +python-versions = ">=3.10,<3.14" +content-hash = "bcab5420339473ec08b89cde588899b60999762fb8ca9a011240d47ea86198e3" From 04719129a64809e28ed9c5e0af40dcd77a2e32dc Mon Sep 17 00:00:00 2001 From: vasilije Date: Sun, 19 Oct 2025 15:53:38 +0200 Subject: [PATCH 54/61] updated env template --- .env.template | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.env.template b/.env.template index 3137636d3..89ac06830 100644 --- a/.env.template +++ b/.env.template @@ -247,10 +247,10 @@ LITELLM_LOG="ERROR" #LLM_PROVIDER="ollama" #LLM_ENDPOINT="http://localhost:11434/v1" #EMBEDDING_PROVIDER="ollama" -#EMBEDDING_MODEL="avr/sfr-embedding-mistral:latest" +#EMBEDDING_MODEL="nomic-embed-text:latest" #EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings" -#EMBEDDING_DIMENSIONS=4096 -#HUGGINGFACE_TOKENIZER="Salesforce/SFR-Embedding-Mistral" +#EMBEDDING_DIMENSIONS=768 +#HUGGINGFACE_TOKENIZER="nomic-ai/nomic-embed-text-v1.5" ########## OpenRouter (also free) ######################################################### From 400095d76df23c33b7c4783654d381255459d0a4 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 21:30:13 +0200 Subject: [PATCH 55/61] fix: Resolve issue with multi-user mode search --- cognee/api/v1/search/search.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 9f158e9d0..4051bae86 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -179,13 +179,6 @@ async def search( if not datasets: raise DatasetNotFoundError(message="No datasets found.") - graph_engine = await get_graph_engine() - is_empty = await graph_engine.is_empty() - - if is_empty: - logger.warning("Search attempt on an empty knowledge graph") - return [] - filtered_search_results = await search_function( query_text=query_text, query_type=query_type, From f88277c467e81f3d63b0e2f713be3d06c3c19276 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 23:10:53 +0200 Subject: [PATCH 56/61] fix: Resolve issue with plain text files not having magic file info --- cognee/infrastructure/files/utils/guess_file_type.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cognee/infrastructure/files/utils/guess_file_type.py 
b/cognee/infrastructure/files/utils/guess_file_type.py
index edd2d89b0..dcdd68cad 100644
--- a/cognee/infrastructure/files/utils/guess_file_type.py
+++ b/cognee/infrastructure/files/utils/guess_file_type.py
@@ -124,6 +124,12 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:
     """
     file_type = filetype.guess(file)
 
+    # If file type could not be determined consider it a plain text file as they don't have magic number encoding
+    if file_type is None:
+        from filetype.types.base import Type
+
+        file_type = Type("text/plain", "txt")
+
     if file_type is None:
         raise FileTypeException(f"Unknown file detected: {file.name}.")

From 8c627d9e10df49d8c2315592b664081fab45e486 Mon Sep 17 00:00:00 2001
From: Hande <159312713+hande-k@users.noreply.github.com>
Date: Mon, 20 Oct 2025 12:03:40 +0200
Subject: [PATCH 57/61] chore: update colab notebook on README

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index a1eebae73..305bffdfe 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ Build dynamic memory for Agents and replace RAG using scalable, modular ECL (Ext
 
 ## Get Started
 
-Get started quickly with a Google Colab notebook , Deepnote notebook or starter repo
+Get started quickly with a Google Colab notebook , Deepnote notebook or starter repo
 
 ## About cognee
 
@@ -224,12 +224,12 @@ We now have a paper you can cite:
 
 ```bibtex
 @misc{markovic2025optimizinginterfaceknowledgegraphs,
-      title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning},
+      title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning},
       author={Vasilije Markovic and Lazar Obradovic and Laszlo Hajdu and Jovan Pavlovic},
       year={2025},
       eprint={2505.24478},
       archivePrefix={arXiv},
       primaryClass={cs.AI},
-      url={https://arxiv.org/abs/2505.24478},
+      url={https://arxiv.org/abs/2505.24478},
 }
 ```

From 279d6e80f03420838ae9e4ca81648563290d4d36 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov
Date: Mon, 20 Oct 2025 11:56:15 +0100
Subject: [PATCH 58/61] Revert "fix: search without prior cognify"

---
 cognee/api/v1/search/search.py | 4 ----
 .../databases/graph/graph_db_interface.py | 5 ----
 .../databases/graph/kuzu/adapter.py | 9 --------
 .../databases/graph/neo4j_driver/adapter.py | 9 --------
 cognee/tests/test_kuzu.py | 23 ++++---------------
 cognee/tests/test_neo4j.py | 22 ++++--------------
 cognee/tests/unit/api/test_search.py | 21 -----------------
 7 files changed, 9 insertions(+), 84 deletions(-)
 delete mode 100644 cognee/tests/unit/api/test_search.py

diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py
index 4051bae86..0a9e76e96 100644
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@@ -1,7 +1,6 @@
 from uuid import UUID
 from typing import Union, Optional, List, Type
 
-from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.users.models import User
 from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult
@@ -9,9 +8,6 @@ from cognee.modules.users.methods import get_default_user
 from cognee.modules.search.methods import search as search_function
 from cognee.modules.data.methods import get_authorized_existing_datasets
 from cognee.modules.data.exceptions import DatasetNotFoundError
-from cognee.shared.logging_utils import get_logger
-
-logger = get_logger()
 
 
 async def search(

diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py
index 67df1a27c..65afdf275 100644
--- a/cognee/infrastructure/databases/graph/graph_db_interface.py
+++ b/cognee/infrastructure/databases/graph/graph_db_interface.py
@@ -159,11 +159,6 @@ class GraphDBInterface(ABC):
     - get_connections
     """
 
-    @abstractmethod
-    async def is_empty(self) -> bool:
-        logger.warning("is_empty() is not implemented")
-        return True
-
     @abstractmethod
     async def query(self, query: str, params: dict) -> List[Any]:
         """

diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py
index 2d3866888..3f0fb0c57 100644
--- a/cognee/infrastructure/databases/graph/kuzu/adapter.py
+++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py
@@ -198,15 +198,6 @@ class KuzuAdapter(GraphDBInterface):
         except FileNotFoundError:
             logger.warning(f"Kuzu S3 storage file not found: {self.db_path}")
 
-    async def is_empty(self) -> bool:
-        query = """
-        MATCH (n)
-        RETURN true
-        LIMIT 1;
-        """
-        query_result = await self.query(query)
-        return len(query_result) == 0
-
     async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]:
         """
         Execute a Kuzu query asynchronously with automatic reconnection.

diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
index 5861b69cb..520295ed2 100644
--- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@@ -87,15 +87,6 @@ class Neo4jAdapter(GraphDBInterface):
         async with self.driver.session(database=self.graph_database_name) as session:
             yield session
 
-    async def is_empty(self) -> bool:
-        query = """
-        RETURN EXISTS {
-            MATCH (n)
-        } AS node_exists;
-        """
-        query_result = await self.query(query)
-        return not query_result[0]["node_exists"]
-
     @deadlock_retry()
     async def query(
         self,

diff --git a/cognee/tests/test_kuzu.py b/cognee/tests/test_kuzu.py
index fe9da6dcb..8749e42d0 100644
--- a/cognee/tests/test_kuzu.py
+++ b/cognee/tests/test_kuzu.py
@@ -47,26 +47,10 @@ async def main():
             pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt"
         )
 
-        from cognee.infrastructure.databases.graph import get_graph_engine
-
-        graph_engine = await get_graph_engine()
-
-        is_empty = await graph_engine.is_empty()
-
-        assert is_empty, "Kuzu graph database is not empty"
-
         await cognee.add([explanation_file_path_quantum], dataset_name)
 
-        is_empty = await graph_engine.is_empty()
-
-        assert is_empty, "Kuzu graph database should be empty before cognify"
-
         await cognee.cognify([dataset_name])
 
-        is_empty = await graph_engine.is_empty()
-
-        assert not is_empty, "Kuzu graph database should not be empty"
-
         from cognee.infrastructure.databases.vector import get_vector_engine
 
         vector_engine = get_vector_engine()
@@ -130,10 +114,11 @@ async def main():
         assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
 
         await cognee.prune.prune_system(metadata=True)
+        from cognee.infrastructure.databases.graph import get_graph_engine
 
-        is_empty = await graph_engine.is_empty()
-
-        assert is_empty, "Kuzu graph database is not empty"
+        graph_engine = await get_graph_engine()
+        nodes, edges = await graph_engine.get_graph_data()
+        assert len(nodes) == 0 and len(edges) == 0, "Kuzu graph database is not empty"
     finally:
         # Ensure cleanup even if tests fail

diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py
index 925614e67..c74b4ab65 100644
--- a/cognee/tests/test_neo4j.py
+++ b/cognee/tests/test_neo4j.py
@@ -35,14 +35,6 @@ async def main():
     explanation_file_path_nlp = os.path.join(
         pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
     )
-    from cognee.infrastructure.databases.graph import get_graph_engine
-
-    graph_engine = await get_graph_engine()
-
-    is_empty = await graph_engine.is_empty()
-
-    assert is_empty, "Graph has to be empty"
-
     await cognee.add([explanation_file_path_nlp], dataset_name)
 
     explanation_file_path_quantum = os.path.join(
@@ -50,16 +42,9 @@ async def main():
     )
     await cognee.add([explanation_file_path_quantum], dataset_name)
 
-    is_empty = await graph_engine.is_empty()
-
-    assert is_empty, "Graph has to be empty before cognify"
    await cognee.cognify([dataset_name])
 
-    is_empty = await graph_engine.is_empty()
-
-    assert not is_empty, "Graph shouldn't be empty"
-
     from cognee.infrastructure.databases.vector import get_vector_engine
 
     vector_engine = get_vector_engine()
@@ -132,8 +117,11 @@ async def main():
     assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
 
     await cognee.prune.prune_system(metadata=True)
-    is_empty = await graph_engine.is_empty()
-    assert is_empty, "Neo4j graph database is not empty"
+    from cognee.infrastructure.databases.graph import get_graph_engine
+
+    graph_engine = await get_graph_engine()
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) == 0 and len(edges) == 0, "Neo4j graph database is not empty"
 
 
 if __name__ == "__main__":

diff --git a/cognee/tests/unit/api/test_search.py b/cognee/tests/unit/api/test_search.py
deleted file mode 100644
index 54a4cc35f..000000000
--- a/cognee/tests/unit/api/test_search.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pytest
-import cognee
-
-
-@pytest.mark.asyncio
-async def test_empty_search_raises_SearchOnEmptyGraphError_on_empty_graph():
-    await cognee.prune.prune_data()
-    await cognee.prune.prune_system(metadata=True)
-    await cognee.add("Sample input")
-    result = await cognee.search("Sample query")
-    assert result == []
-
-
-@pytest.mark.asyncio
-async def test_empty_search_doesnt_raise_SearchOnEmptyGraphError():
-    await cognee.prune.prune_data()
-    await cognee.prune.prune_system(metadata=True)
-    await cognee.add("Sample input")
-    await cognee.cognify()
-    result = await cognee.search("Sample query")
-    assert result != []

From 3e54b67b4d7f20c385afad0bc878943df9a0b86c Mon Sep 17 00:00:00 2001
From: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Date: Mon, 20 Oct 2025 15:03:35 +0200
Subject: [PATCH 59/61] fix: Resolve missing argument for distributed (#1563)

## Description

Resolve missing argument for distributed

## Type of Change

- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)

## Pre-submission Checklist

- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the issue/feature**
- [ ] My code follows the project's coding standards and style guidelines
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
---
 cognee/modules/pipelines/operations/run_tasks_distributed.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cognee/modules/pipelines/operations/run_tasks_distributed.py b/cognee/modules/pipelines/operations/run_tasks_distributed.py
index 95cdb0266..3fce3763d 100644
--- a/cognee/modules/pipelines/operations/run_tasks_distributed.py
+++ b/cognee/modules/pipelines/operations/run_tasks_distributed.py
@@ -88,6 +88,7 @@ async def run_tasks_distributed(
     pipeline_name: str = "unknown_pipeline",
     context: dict = None,
     incremental_loading: bool = False,
+    data_per_batch: int = 20,
 ):
     if not user:
         user = await get_default_user()

From df038365c848775229e1c9255d56992352b1990e Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Mon, 20 Oct 2025 17:27:49 +0200
Subject: [PATCH 60/61] fix: fixes id in get_filtered_graph_data (#1569)

## Description

Fixes get_filtered_graph_data method in neo4jAdapter.

## Type of Change

- [x] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)

## Pre-submission Checklist

- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the issue/feature**
- [x] My code follows the project's coding standards and style guidelines
- [x] I have added tests that prove my fix is effective or that my feature works
- [x] I have added necessary documentation (if applicable)
- [x] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been submitted already
- [x] I have linked any relevant issues in the description
- [x] My commits have clear and descriptive messages

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
---
 cognee/infrastructure/databases/graph/neo4j_driver/adapter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
index 520295ed2..365d02979 100644
--- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@@ -1067,7 +1067,7 @@ class Neo4jAdapter(GraphDBInterface):
         query_nodes = f"""
         MATCH (n)
         WHERE {where_clause}
-        RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties
+        RETURN n.id AS id, labels(n) AS labels, properties(n) AS properties
         """
         result_nodes = await self.query(query_nodes)
 
@@ -1082,7 +1082,7 @@ class Neo4jAdapter(GraphDBInterface):
         query_edges = f"""
         MATCH (n)-[r]->(m)
         WHERE {where_clause} AND {where_clause.replace("n.", "m.")}
-        RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
+        RETURN n.id AS source, n.id AS target, TYPE(r) AS type, properties(r) AS properties
         """
         result_edges = await self.query(query_edges)

From 612a2252ce012fc8929ffe6523ed6bc948a4db55 Mon Sep 17 00:00:00 2001
From: vasilije
Date: Tue, 21 Oct 2025 07:22:52 +0200
Subject: [PATCH 61/61] fix

---
 poetry.lock | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/poetry.lock b/poetry.lock
index 80263027e..2773e61b9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
 
 [[package]]
 name = "accelerate"
@@ -4366,6 +4366,8 @@ groups = ["main"]
 markers = "extra == \"dlt\""
 files = [
     {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"},
+    {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"},
+    {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"},
 ]
 
 [package.dependencies]
@@ -10208,6 +10210,13 @@ optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
+    {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"},
+    {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"},
     {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"},
     {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"},
     {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"},