cognee/cognee/tests/test_pipeline_cache.py
Igor Ilic ede884e0b0
feat: make pipeline processing cache optional (#1876)
<!-- .github/pull_request_template.md -->

## Description
Make the pipeline cache mechanism optional, have it turned off by
default but use it for add and cognify like it has been used until now

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the
issue/feature**
- [x] My code follows the project's coding standards and style
guidelines
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] I have added necessary documentation (if applicable)
- [x] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been
submitted already
- [x] I have linked any relevant issues in the description
- [x] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Introduced pipeline caching across ingestion, processing, and custom
pipeline flows with per-run controls to enable or disable caching.
  * Added an option for incremental loading in custom pipeline runs.

* **Behavior Changes**
* One pipeline path now explicitly bypasses caching by default to always
re-run when invoked.
* Disabling cache forces re-processing instead of early exit; cache
reset still enables re-execution.

* **Tests**
* Added tests validating caching, non-caching, and cache-reset
re-execution behavior.

* **Chores**
  * Added CI job to run pipeline caching tests.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-12-12 13:11:31 +01:00

164 lines
5.6 KiB
Python

"""
Test suite for the pipeline_cache feature in Cognee pipelines.
This module tests the behavior of the `pipeline_cache` parameter which controls
whether a pipeline should skip re-execution when it has already been completed
for the same dataset.
Architecture Overview:
---------------------
The pipeline_cache mechanism works at the dataset level:
1. When a pipeline runs, it logs its status (INITIATED -> STARTED -> COMPLETED)
2. Before each run, `check_pipeline_run_qualification()` checks the pipeline status
3. If `use_pipeline_cache=True` and status is COMPLETED/STARTED, the pipeline skips
4. If `use_pipeline_cache=False`, the pipeline always re-executes regardless of status
"""
import pytest
import cognee
from cognee.modules.pipelines.tasks.task import Task
from cognee.modules.pipelines import run_pipeline
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
reset_dataset_pipeline_run_status,
)
from cognee.infrastructure.databases.relational import create_db_and_tables
class ExecutionCounter:
    """Mutable holder that records how many times a pipeline task ran."""

    def __init__(self):
        # Starts at zero; the counting task bumps this on every execution.
        self.count = 0
async def create_counting_task(data, counter: "ExecutionCounter"):
    """Record one execution on *counter* and hand the counter back.

    Used as a pipeline task so the tests can observe how many times the
    pipeline body was actually invoked; *data* is accepted but unused.
    """
    counter.count += 1
    return counter
class TestPipelineCache:
    """Tests for basic pipeline_cache on/off behavior."""

    @staticmethod
    async def _fresh_environment():
        # Wipe all data and metadata, then recreate the relational schema
        # so every test starts from a clean slate.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)
        await create_db_and_tables()

    @pytest.mark.asyncio
    async def test_pipeline_cache_off_allows_reexecution(self):
        """
        Test that with use_pipeline_cache=False, the pipeline re-executes
        even when it has already completed for the dataset.
        Expected behavior:
        - First run: Pipeline executes fully, task runs once
        - Second run: Pipeline executes again, task runs again (total: 2 times)
        """
        await self._fresh_environment()

        counter = ExecutionCounter()
        user = await get_default_user()
        tasks = [Task(create_counting_task, counter=counter)]

        # First run: drain the pipeline generator to completion.
        first_results = [
            item
            async for item in run_pipeline(
                tasks=tasks,
                datasets="test_dataset_cache_off",
                data=["sample data"],  # Data is necessary to trigger processing
                user=user,
                pipeline_name="test_cache_off_pipeline",
                use_pipeline_cache=False,
            )
        ]
        assert first_results is not None
        count_after_first = counter.count
        assert count_after_first >= 1, "Task should have executed at least once on first run"

        # Second run with pipeline_cache=False: must NOT be skipped.
        second_results = [
            item
            async for item in run_pipeline(
                tasks=tasks,
                datasets="test_dataset_cache_off",
                data=["sample data"],  # Data is necessary to trigger processing
                user=user,
                pipeline_name="test_cache_off_pipeline",
                use_pipeline_cache=False,
            )
        ]
        assert second_results is not None
        count_after_second = counter.count
        assert count_after_second > count_after_first, (
            f"With pipeline_cache=False, task should re-execute. "
            f"First run: {count_after_first}, After second run: {count_after_second}"
        )

    @pytest.mark.asyncio
    async def test_reset_pipeline_status_allows_reexecution_with_cache(self):
        """
        Test that resetting pipeline status allows re-execution even with
        pipeline_cache=True.
        """
        await self._fresh_environment()

        counter = ExecutionCounter()
        user = await get_default_user()
        dataset_name = "reset_status_test"
        pipeline_name = "test_reset_pipeline"
        tasks = [Task(create_counting_task, counter=counter)]

        # First run: keep the results — the dataset_id is needed for the reset.
        initial_results = [
            item
            async for item in run_pipeline(
                tasks=tasks,
                datasets=dataset_name,
                user=user,
                data=["sample data"],  # Data is necessary to trigger processing
                pipeline_name=pipeline_name,
                use_pipeline_cache=True,
            )
        ]
        count_after_first = counter.count
        assert count_after_first >= 1

        # Second run without reset: the cache should short-circuit execution.
        async for _ in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pass
        count_after_second = counter.count
        assert count_after_second == count_after_first, "Should have skipped due to cache"

        # Clear the COMPLETED status so the cached pipeline becomes eligible again.
        await reset_dataset_pipeline_run_status(
            initial_results[0].dataset_id, user, pipeline_names=[pipeline_name]
        )

        # Third run after reset: should execute despite use_pipeline_cache=True.
        async for _ in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pass
        count_after_reset = counter.count
        assert count_after_reset > count_after_second, (
            f"After reset, pipeline should re-execute. "
            f"Before reset: {count_after_second}, After reset run: {count_after_reset}"
        )