Merge branch 'dev' into feature/cog-3532-empower-test_search-db-retrievers-tests-reorg

This commit is contained in:
hajdul88 2025-12-12 14:25:01 +01:00 committed by GitHub
commit 3a48930c3b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 302 additions and 25 deletions

View file

@ -582,3 +582,30 @@ jobs:
DB_USERNAME: cognee
DB_PASSWORD: cognee
run: uv run python ./cognee/tests/test_conversation_history.py
run-pipeline-cache-test:
name: Test Pipeline Caching
runs-on: ubuntu-22.04
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
- name: Run Pipeline Cache Test
env:
ENV: 'dev'
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./cognee/tests/test_pipeline_cache.py

View file

@ -49,6 +49,20 @@ def _recreate_table_without_unique_constraint_sqlite(op, insp):
sa.Column("graph_database_name", sa.String(), nullable=False),
sa.Column("vector_database_provider", sa.String(), nullable=False),
sa.Column("graph_database_provider", sa.String(), nullable=False),
sa.Column(
"vector_dataset_database_handler",
sa.String(),
unique=False,
nullable=False,
server_default="lancedb",
),
sa.Column(
"graph_dataset_database_handler",
sa.String(),
unique=False,
nullable=False,
server_default="kuzu",
),
sa.Column("vector_database_url", sa.String()),
sa.Column("graph_database_url", sa.String()),
sa.Column("vector_database_key", sa.String()),
@ -82,6 +96,8 @@ def _recreate_table_without_unique_constraint_sqlite(op, insp):
graph_database_name,
vector_database_provider,
graph_database_provider,
vector_dataset_database_handler,
graph_dataset_database_handler,
vector_database_url,
graph_database_url,
vector_database_key,
@ -120,6 +136,20 @@ def _recreate_table_with_unique_constraint_sqlite(op, insp):
sa.Column("graph_database_name", sa.String(), nullable=False, unique=True),
sa.Column("vector_database_provider", sa.String(), nullable=False),
sa.Column("graph_database_provider", sa.String(), nullable=False),
sa.Column(
"vector_dataset_database_handler",
sa.String(),
unique=False,
nullable=False,
server_default="lancedb",
),
sa.Column(
"graph_dataset_database_handler",
sa.String(),
unique=False,
nullable=False,
server_default="kuzu",
),
sa.Column("vector_database_url", sa.String()),
sa.Column("graph_database_url", sa.String()),
sa.Column("vector_database_key", sa.String()),
@ -153,6 +183,8 @@ def _recreate_table_with_unique_constraint_sqlite(op, insp):
graph_database_name,
vector_database_provider,
graph_database_provider,
vector_dataset_database_handler,
graph_dataset_database_handler,
vector_database_url,
graph_database_url,
vector_database_key,
@ -193,6 +225,22 @@ def upgrade() -> None:
),
)
vector_dataset_database_handler = _get_column(
insp, "dataset_database", "vector_dataset_database_handler"
)
if not vector_dataset_database_handler:
# Add LanceDB as the default vector dataset database handler
op.add_column(
"dataset_database",
sa.Column(
"vector_dataset_database_handler",
sa.String(),
unique=False,
nullable=False,
server_default="lancedb",
),
)
graph_database_connection_info_column = _get_column(
insp, "dataset_database", "graph_database_connection_info"
)
@ -208,6 +256,22 @@ def upgrade() -> None:
),
)
graph_dataset_database_handler = _get_column(
insp, "dataset_database", "graph_dataset_database_handler"
)
if not graph_dataset_database_handler:
# Add Kuzu as the default graph dataset database handler
op.add_column(
"dataset_database",
sa.Column(
"graph_dataset_database_handler",
sa.String(),
unique=False,
nullable=False,
server_default="kuzu",
),
)
with op.batch_alter_table("dataset_database", schema=None) as batch_op:
# Drop the unique constraint to make unique=False
graph_constraint_to_drop = None
@ -265,3 +329,5 @@ def downgrade() -> None:
op.drop_column("dataset_database", "vector_database_connection_info")
op.drop_column("dataset_database", "graph_database_connection_info")
op.drop_column("dataset_database", "vector_dataset_database_handler")
op.drop_column("dataset_database", "graph_dataset_database_handler")

View file

@ -205,6 +205,7 @@ async def add(
pipeline_name="add_pipeline",
vector_db_config=vector_db_config,
graph_db_config=graph_db_config,
use_pipeline_cache=True,
incremental_loading=incremental_loading,
data_per_batch=data_per_batch,
):

View file

@ -237,6 +237,7 @@ async def cognify(
vector_db_config=vector_db_config,
graph_db_config=graph_db_config,
incremental_loading=incremental_loading,
use_pipeline_cache=True,
pipeline_name="cognify_pipeline",
data_per_batch=data_per_batch,
)

View file

@ -47,6 +47,7 @@ class KuzuDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
"graph_database_url": graph_db_url,
"graph_database_provider": graph_config.graph_database_provider,
"graph_database_key": graph_db_key,
"graph_dataset_database_handler": "kuzu",
"graph_database_connection_info": {
"graph_database_username": graph_db_username,
"graph_database_password": graph_db_password,

View file

@ -131,6 +131,7 @@ class Neo4jAuraDevDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
"graph_database_url": graph_db_url,
"graph_database_provider": "neo4j",
"graph_database_key": graph_db_key,
"graph_dataset_database_handler": "neo4j_aura_dev",
"graph_database_connection_info": {
"graph_database_username": graph_db_username,
"graph_database_password": encrypted_db_password_string,

View file

@ -1,27 +1,21 @@
from cognee.infrastructure.databases.vector import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.modules.users.models.DatasetDatabase import DatasetDatabase
async def _get_vector_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase:
vector_config = get_vectordb_config()
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)
handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler]
handler = supported_dataset_database_handlers[dataset_database.vector_dataset_database_handler]
return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database)
async def _get_graph_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase:
graph_config = get_graph_config()
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)
handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler]
handler = supported_dataset_database_handlers[dataset_database.graph_dataset_database_handler]
return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database)

View file

@ -36,6 +36,7 @@ class LanceDBDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
"vector_database_url": os.path.join(databases_directory_path, vector_db_name),
"vector_database_key": vector_config.vector_db_key,
"vector_database_name": vector_db_name,
"vector_dataset_database_handler": "lancedb",
}
@classmethod

View file

@ -5,8 +5,6 @@ from cognee.context_global_variables import backend_access_control_enabled
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.infrastructure.databases.vector.config import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.shared.cache import delete_cache
from cognee.modules.users.models import DatasetDatabase
from cognee.shared.logging_utils import get_logger
@ -16,12 +14,13 @@ logger = get_logger()
async def prune_graph_databases():
async def _prune_graph_db(dataset_database: DatasetDatabase) -> dict:
graph_config = get_graph_config()
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)
handler = supported_dataset_database_handlers[graph_config.graph_dataset_database_handler]
handler = supported_dataset_database_handlers[
dataset_database.graph_dataset_database_handler
]
return await handler["handler_instance"].delete_dataset(dataset_database)
db_engine = get_relational_engine()
@ -40,13 +39,13 @@ async def prune_graph_databases():
async def prune_vector_databases():
async def _prune_vector_db(dataset_database: DatasetDatabase) -> dict:
vector_config = get_vectordb_config()
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)
handler = supported_dataset_database_handlers[vector_config.vector_dataset_database_handler]
handler = supported_dataset_database_handlers[
dataset_database.vector_dataset_database_handler
]
return await handler["handler_instance"].delete_dataset(dataset_database)
db_engine = get_relational_engine()

View file

@ -12,9 +12,6 @@ from cognee.modules.users.models import User
from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
resolve_authorized_user_datasets,
)
from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
reset_dataset_pipeline_run_status,
)
from cognee.modules.engine.operations.setup import setup
from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor
from cognee.tasks.memify.extract_subgraph_chunks import extract_subgraph_chunks
@ -97,10 +94,6 @@ async def memify(
*enrichment_tasks,
]
await reset_dataset_pipeline_run_status(
authorized_dataset.id, user, pipeline_names=["memify_pipeline"]
)
# By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for
pipeline_executor_func = get_pipeline_executor(run_in_background=run_in_background)
@ -113,6 +106,7 @@ async def memify(
datasets=authorized_dataset.id,
vector_db_config=vector_db_config,
graph_db_config=graph_db_config,
use_pipeline_cache=False,
incremental_loading=False,
pipeline_name="memify_pipeline",
)

View file

@ -20,6 +20,9 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
from cognee.modules.pipelines.layers.check_pipeline_run_qualification import (
check_pipeline_run_qualification,
)
from cognee.modules.pipelines.models.PipelineRunInfo import (
PipelineRunStarted,
)
from typing import Any
logger = get_logger("cognee.pipeline")
@ -35,6 +38,7 @@ async def run_pipeline(
pipeline_name: str = "custom_pipeline",
vector_db_config: dict = None,
graph_db_config: dict = None,
use_pipeline_cache: bool = False,
incremental_loading: bool = False,
data_per_batch: int = 20,
):
@ -51,6 +55,7 @@ async def run_pipeline(
data=data,
pipeline_name=pipeline_name,
context={"dataset": dataset},
use_pipeline_cache=use_pipeline_cache,
incremental_loading=incremental_loading,
data_per_batch=data_per_batch,
):
@ -64,6 +69,7 @@ async def run_pipeline_per_dataset(
data=None,
pipeline_name: str = "custom_pipeline",
context: dict = None,
use_pipeline_cache=False,
incremental_loading=False,
data_per_batch: int = 20,
):
@ -77,8 +83,18 @@ async def run_pipeline_per_dataset(
if process_pipeline_status:
# If pipeline was already processed or is currently being processed
# return status information to async generator and finish execution
yield process_pipeline_status
return
if use_pipeline_cache:
# If pipeline caching is enabled we do not proceed with re-processing
yield process_pipeline_status
return
else:
# If pipeline caching is disabled we always return pipeline started information and proceed with re-processing
yield PipelineRunStarted(
pipeline_run_id=process_pipeline_status.pipeline_run_id,
dataset_id=dataset.id,
dataset_name=dataset.name,
payload=data,
)
pipeline_run = run_tasks(
tasks,

View file

@ -18,6 +18,8 @@ async def run_custom_pipeline(
user: User = None,
vector_db_config: Optional[dict] = None,
graph_db_config: Optional[dict] = None,
use_pipeline_cache: bool = False,
incremental_loading: bool = False,
data_per_batch: int = 20,
run_in_background: bool = False,
pipeline_name: str = "custom_pipeline",
@ -40,6 +42,10 @@ async def run_custom_pipeline(
user: User context for authentication and data access. Uses default if None.
vector_db_config: Custom vector database configuration for embeddings storage.
graph_db_config: Custom graph database configuration for relationship storage.
use_pipeline_cache: If True, pipelines with the same ID that are currently executing and pipelines with the same ID that were completed won't process data again.
The pipeline ID is created by the generate_pipeline_id function. Pipeline status can be manually reset with the reset_dataset_pipeline_run_status function.
incremental_loading: If True, only new or modified data will be processed to avoid duplication. (Only works if data is used with the Cognee python Data model).
The incremental system stores and compares hashes of processed data in the Data model and skips data with the same content hash.
data_per_batch: Number of data items to be processed in parallel.
run_in_background: If True, starts processing asynchronously and returns immediately.
If False, waits for completion before returning.
@ -63,7 +69,8 @@ async def run_custom_pipeline(
datasets=dataset,
vector_db_config=vector_db_config,
graph_db_config=graph_db_config,
incremental_loading=False,
use_pipeline_cache=use_pipeline_cache,
incremental_loading=incremental_loading,
data_per_batch=data_per_batch,
pipeline_name=pipeline_name,
)

View file

@ -18,6 +18,9 @@ class DatasetDatabase(Base):
vector_database_provider = Column(String, unique=False, nullable=False)
graph_database_provider = Column(String, unique=False, nullable=False)
graph_dataset_database_handler = Column(String, unique=False, nullable=False)
vector_dataset_database_handler = Column(String, unique=False, nullable=False)
vector_database_url = Column(String, unique=False, nullable=True)
graph_database_url = Column(String, unique=False, nullable=True)

View file

@ -30,6 +30,7 @@ class LanceDBTestDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
vector_db_name = "test.lance.db"
return {
"vector_dataset_database_handler": "custom_lancedb_handler",
"vector_database_name": vector_db_name,
"vector_database_url": os.path.join(databases_directory_path, vector_db_name),
"vector_database_provider": "lancedb",
@ -44,6 +45,7 @@ class KuzuTestDatasetDatabaseHandler(DatasetDatabaseHandlerInterface):
graph_db_name = "test.kuzu"
return {
"graph_dataset_database_handler": "custom_kuzu_handler",
"graph_database_name": graph_db_name,
"graph_database_url": os.path.join(databases_directory_path, graph_db_name),
"graph_database_provider": "kuzu",

View file

@ -0,0 +1,164 @@
"""
Test suite for the pipeline_cache feature in Cognee pipelines.
This module tests the behavior of the `use_pipeline_cache` parameter which controls
whether a pipeline should skip re-execution when it has already been completed
for the same dataset.
Architecture Overview:
---------------------
The pipeline_cache mechanism works at the dataset level:
1. When a pipeline runs, it logs its status (INITIATED -> STARTED -> COMPLETED)
2. Before each run, `check_pipeline_run_qualification()` checks the pipeline status
3. If `use_pipeline_cache=True` and status is COMPLETED/STARTED, the pipeline skips
4. If `use_pipeline_cache=False`, the pipeline always re-executes regardless of status
"""
import pytest
import cognee
from cognee.modules.pipelines.tasks.task import Task
from cognee.modules.pipelines import run_pipeline
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
reset_dataset_pipeline_run_status,
)
from cognee.infrastructure.databases.relational import create_db_and_tables
class ExecutionCounter:
    """Mutable holder for the number of times a task body has run."""

    def __init__(self):
        # Starts at zero; task bodies bump this in place on every execution.
        self.count: int = 0
async def create_counting_task(data, counter: ExecutionCounter):
    """Record one execution on *counter* and hand the same counter back."""
    counter.count = counter.count + 1
    return counter
class TestPipelineCache:
    """Tests for basic pipeline_cache on/off behavior."""

    @pytest.mark.asyncio
    async def test_pipeline_cache_off_allows_reexecution(self):
        """
        Test that with use_pipeline_cache=False, the pipeline re-executes
        even when it has already completed for the dataset.

        Expected behavior:
        - First run: Pipeline executes fully, task runs once
        - Second run: Pipeline executes again, task runs again (total: 2 times)
        """
        # Start from a clean slate so prior runs cannot satisfy the cache check.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)
        await create_db_and_tables()

        # The same counter instance is shared across both runs, so its count
        # accumulates and reveals whether the task body re-executed.
        counter = ExecutionCounter()
        user = await get_default_user()

        tasks = [Task(create_counting_task, counter=counter)]

        # First run
        pipeline_results_1 = []
        async for result in run_pipeline(
            tasks=tasks,
            datasets="test_dataset_cache_off",
            data=["sample data"],  # Data is necessary to trigger processing
            user=user,
            pipeline_name="test_cache_off_pipeline",
            use_pipeline_cache=False,
        ):
            pipeline_results_1.append(result)

        first_run_count = counter.count
        assert first_run_count >= 1, "Task should have executed at least once on first run"

        # Second run with use_pipeline_cache=False — must not be skipped.
        pipeline_results_2 = []
        async for result in run_pipeline(
            tasks=tasks,
            datasets="test_dataset_cache_off",
            data=["sample data"],  # Data is necessary to trigger processing
            user=user,
            pipeline_name="test_cache_off_pipeline",
            use_pipeline_cache=False,
        ):
            pipeline_results_2.append(result)

        second_run_count = counter.count
        assert second_run_count > first_run_count, (
            f"With pipeline_cache=False, task should re-execute. "
            f"First run: {first_run_count}, After second run: {second_run_count}"
        )

    @pytest.mark.asyncio
    async def test_reset_pipeline_status_allows_reexecution_with_cache(self):
        """
        Test that resetting pipeline status allows re-execution even with
        pipeline_cache=True.
        """
        # Clean environment so this test's cache state is self-contained.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)
        await create_db_and_tables()

        counter = ExecutionCounter()
        user = await get_default_user()
        dataset_name = "reset_status_test"
        pipeline_name = "test_reset_pipeline"

        tasks = [Task(create_counting_task, counter=counter)]

        # First run — populates the pipeline-run status for the dataset.
        pipeline_result = []
        async for result in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pipeline_result.append(result)

        first_run_count = counter.count
        assert first_run_count >= 1

        # Second run without reset - should skip
        async for _ in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pass

        after_second_run = counter.count
        assert after_second_run == first_run_count, "Should have skipped due to cache"

        # Reset the pipeline status
        # NOTE: dataset_id is taken from the first run's result object.
        await reset_dataset_pipeline_run_status(
            pipeline_result[0].dataset_id, user, pipeline_names=[pipeline_name]
        )

        # Third run after reset - should execute
        async for _ in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pass

        after_reset_run = counter.count
        assert after_reset_run > after_second_run, (
            f"After reset, pipeline should re-execute. "
            f"Before reset: {after_second_run}, After reset run: {after_reset_run}"
        )