<!-- .github/pull_request_template.md -->
## Description
Make the pipeline cache mechanism optional, have it turned off by
default but use it for add and cognify like it has been used until now
## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):
## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the
issue/feature**
- [x] My code follows the project's coding standards and style
guidelines
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] I have added necessary documentation (if applicable)
- [x] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been
submitted already
- [x] I have linked any relevant issues in the description
- [x] My commits have clear and descriptive messages
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
* Introduced pipeline caching across ingestion, processing, and custom
pipeline flows with per-run controls to enable or disable caching.
* Added an option for incremental loading in custom pipeline runs.
* **Behavior Changes**
* One pipeline path now explicitly bypasses caching by default to always
re-run when invoked.
* Disabling cache forces re-processing instead of early exit; cache
reset still enables re-execution.
* **Tests**
* Added tests validating caching, non-caching, and cache-reset
re-execution behavior.
* **Chores**
* Added CI job to run pipeline caching tests.
<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
"""
Test suite for the pipeline_cache feature in Cognee pipelines.

This module tests the behavior of the `pipeline_cache` parameter which controls
whether a pipeline should skip re-execution when it has already been completed
for the same dataset.

Architecture Overview:
---------------------
The pipeline_cache mechanism works at the dataset level:
1. When a pipeline runs, it logs its status (INITIATED -> STARTED -> COMPLETED)
2. Before each run, `check_pipeline_run_qualification()` checks the pipeline status
3. If `use_pipeline_cache=True` and status is COMPLETED/STARTED, the pipeline skips
4. If `use_pipeline_cache=False`, the pipeline always re-executes regardless of status
"""

import pytest

import cognee
from cognee.infrastructure.databases.relational import create_db_and_tables
from cognee.modules.pipelines import run_pipeline
from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
    reset_dataset_pipeline_run_status,
)
from cognee.modules.pipelines.tasks.task import Task
from cognee.modules.users.methods import get_default_user


class ExecutionCounter:
    """Mutable helper that records how many times a task has been executed."""

    def __init__(self) -> None:
        # Number of completed invocations of the counting task; starts at zero.
        self.count: int = 0


async def create_counting_task(data, counter: ExecutionCounter):
    """Pipeline task that bumps ``counter`` once per execution.

    Args:
        data: Payload handed over by the pipeline; not inspected here.
        counter: Shared ``ExecutionCounter`` whose ``count`` is incremented
            every time this task runs.

    Returns:
        The same ``ExecutionCounter`` instance, so tests can read the
        accumulated execution count.
    """
    counter.count += 1
    return counter


class TestPipelineCache:
    """Tests for basic pipeline_cache on/off behavior."""

    @pytest.mark.asyncio
    async def test_pipeline_cache_off_allows_reexecution(self):
        """
        Test that with use_pipeline_cache=False, the pipeline re-executes
        even when it has already completed for the dataset.

        Expected behavior:
        - First run: Pipeline executes fully, task runs once
        - Second run: Pipeline executes again, task runs again (total: 2 times)
        """
        # Start from a clean slate so leftover pipeline-run statuses from other
        # tests cannot cause a false cache hit or miss.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)
        await create_db_and_tables()

        counter = ExecutionCounter()
        user = await get_default_user()

        tasks = [Task(create_counting_task, counter=counter)]

        # First run
        pipeline_results_1 = []
        async for result in run_pipeline(
            tasks=tasks,
            datasets="test_dataset_cache_off",
            data=["sample data"],  # Data is necessary to trigger processing
            user=user,
            pipeline_name="test_cache_off_pipeline",
            use_pipeline_cache=False,
        ):
            pipeline_results_1.append(result)

        first_run_count = counter.count
        assert first_run_count >= 1, "Task should have executed at least once on first run"

        # Second run with pipeline_cache=False: same dataset and pipeline name,
        # so a cache hit would normally skip execution — caching is disabled here.
        pipeline_results_2 = []
        async for result in run_pipeline(
            tasks=tasks,
            datasets="test_dataset_cache_off",
            data=["sample data"],  # Data is necessary to trigger processing
            user=user,
            pipeline_name="test_cache_off_pipeline",
            use_pipeline_cache=False,
        ):
            pipeline_results_2.append(result)

        second_run_count = counter.count
        assert second_run_count > first_run_count, (
            f"With pipeline_cache=False, task should re-execute. "
            f"First run: {first_run_count}, After second run: {second_run_count}"
        )

    @pytest.mark.asyncio
    async def test_reset_pipeline_status_allows_reexecution_with_cache(self):
        """
        Test that resetting pipeline status allows re-execution even with
        pipeline_cache=True.
        """
        # Clean slate, same rationale as above.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)
        await create_db_and_tables()

        counter = ExecutionCounter()
        user = await get_default_user()
        dataset_name = "reset_status_test"
        pipeline_name = "test_reset_pipeline"

        tasks = [Task(create_counting_task, counter=counter)]

        # First run — populates the pipeline-run status so the cache can hit later.
        pipeline_result = []
        async for result in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pipeline_result.append(result)

        first_run_count = counter.count
        assert first_run_count >= 1

        # Second run without reset - should skip because the cached COMPLETED
        # status qualifies the pipeline to be skipped.
        async for _ in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pass

        after_second_run = counter.count
        assert after_second_run == first_run_count, "Should have skipped due to cache"

        # Reset the pipeline status; the dataset id comes from the first run's
        # result object.
        await reset_dataset_pipeline_run_status(
            pipeline_result[0].dataset_id, user, pipeline_names=[pipeline_name]
        )

        # Third run after reset - should execute even though caching is enabled.
        async for _ in run_pipeline(
            tasks=tasks,
            datasets=dataset_name,
            user=user,
            data=["sample data"],  # Data is necessary to trigger processing
            pipeline_name=pipeline_name,
            use_pipeline_cache=True,
        ):
            pass

        after_reset_run = counter.count
        assert after_reset_run > after_second_run, (
            f"After reset, pipeline should re-execute. "
            f"Before reset: {after_second_run}, After reset run: {after_reset_run}"
        )