Incremental loading migration (#1238)


## Description
Add a relational database migration for incremental loading, and change
incremental loading to process documents one by one instead of running
them all asynchronously together.
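
As context for the migration below, here is a minimal sketch of the kind of per-document bookkeeping the new `pipeline_status` JSON column supports. The `Data` model, status key, and status value are hypothetical illustrations, not Cognee's actual ORM code; only the column definition mirrors the migration in this PR.

```python
# Hypothetical sketch (not Cognee's actual models): per-document status
# tracking via the pipeline_status column added by this PR's migration.
from sqlalchemy import JSON, Column, String, create_engine
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Data(Base):
    """Assumed stand-in for the existing "data" table."""

    __tablename__ = "data"

    id = Column(String, primary_key=True)
    # Mirrors the migration: a non-null JSON dict defaulting to {}
    pipeline_status = Column(MutableDict.as_mutable(JSON), nullable=False, default=dict)


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Data(id="doc-1"))
    session.commit()

    # MutableDict tracks in-place mutations, so updating a single key marks
    # the row dirty and the change is persisted on the next commit.
    doc = session.get(Data, "doc-1")
    doc.pipeline_status["example_pipeline"] = "completed"  # hypothetical status value
    session.commit()
```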

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Author: Igor Ilic · 2025-08-13 13:58:09 +02:00 (committed via GitHub)
Commit: beea2f5e0a (parent fd9aaf57b1)
2 changed files with 90 additions and 17 deletions

@@ -0,0 +1,48 @@
"""incremental_loading
Revision ID: 1daae0df1866
Revises: b9274c27a25a
Create Date: 2025-08-12 13:14:12.515935
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
from sqlalchemy.ext.mutable import MutableDict
# revision identifiers, used by Alembic.
revision: str = "1daae0df1866"
down_revision: Union[str, None] = "b9274c27a25a"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def _get_column(inspector, table, name, schema=None):
for col in inspector.get_columns(table, schema=schema):
if col["name"] == name:
return col
return None
def upgrade() -> None:
conn = op.get_bind()
insp = sa.inspect(conn)
# If column already exists skip migration
pipeline_status_column = _get_column(insp, "data", "pipeline_status")
if not pipeline_status_column:
op.add_column(
"data",
sa.Column(
"pipeline_status",
MutableDict.as_mutable(sa.JSON),
nullable=False,
server_default=sa.text("'{}'"),
),
)
def downgrade() -> None:
op.drop_column("data", "pipeline_status")
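
A note on the migration: `server_default=sa.text("'{}'")` backfills existing rows in `data` with an empty JSON object at the database level, which is what allows the column to be added with `nullable=False`. Assuming the repository's standard Alembic setup, this revision is applied with `alembic upgrade head` and rolled back with `alembic downgrade b9274c27a25a`.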

@@ -263,24 +263,49 @@ async def run_tasks(
     if incremental_loading:
         data = await resolve_data_directories(data)
-        # Create async tasks per data item that will run the pipeline for the data item
-        data_item_tasks = [
-            asyncio.create_task(
-                _run_tasks_data_item(
-                    data_item,
-                    dataset,
-                    tasks,
-                    pipeline_name,
-                    pipeline_id,
-                    pipeline_run_id,
-                    context,
-                    user,
-                    incremental_loading,
-                )
-            )
-            for data_item in data
-        ]
-        results = await asyncio.gather(*data_item_tasks)
-        # Remove skipped data items from results
-        results = [result for result in results if result]
+        # TODO: Return to using asyncio.gather for data items after Cognee release
+        # # Create async tasks per data item that will run the pipeline for the data item
+        # data_item_tasks = [
+        #     asyncio.create_task(
+        #         _run_tasks_data_item(
+        #             data_item,
+        #             dataset,
+        #             tasks,
+        #             pipeline_name,
+        #             pipeline_id,
+        #             pipeline_run_id,
+        #             context,
+        #             user,
+        #             incremental_loading,
+        #         )
+        #     )
+        #     for data_item in data
+        # ]
+        # results = await asyncio.gather(*data_item_tasks)
+        # # Remove skipped data items from results
+        # results = [result for result in results if result]
+
+        ### TEMP sync data item handling
+        results = []
+        # Run the pipeline for each data_item sequentially, one after the other
+        for data_item in data:
+            result = await _run_tasks_data_item(
+                data_item,
+                dataset,
+                tasks,
+                pipeline_name,
+                pipeline_id,
+                pipeline_run_id,
+                context,
+                user,
+                incremental_loading,
+            )
+            # Skip items that returned a falsy value
+            if result:
+                results.append(result)
+        ### END
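
For context on the TODO above: one possible middle ground between full `asyncio.gather` fan-out and the temporary sequential loop is bounded concurrency. The sketch below is hypothetical and not part of this PR; it assumes a coroutine like `_run_tasks_data_item`, partially applied to the pipeline arguments, is passed in as `run_item`.

```python
import asyncio


# Hypothetical sketch: a bounded-concurrency variant of the gather-based code
# the TODO plans to restore. At most max_concurrency data items run at once,
# and falsy (skipped) results are filtered out as in the sequential loop.
async def run_data_items_bounded(data, run_item, max_concurrency: int = 8):
    semaphore = asyncio.Semaphore(max_concurrency)

    async def guarded(item):
        async with semaphore:
            return await run_item(item)

    results = await asyncio.gather(*(guarded(item) for item in data))
    # Remove skipped data items from results
    return [result for result in results if result]
```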