# cognee/cognee/modules/pipelines/operations/run_tasks_distributed.py

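"""Distributed pipeline execution for cognee.

Fans individual data items out to Modal workers via ``run_tasks_on_modal`` and
aggregates the per-item results into pipeline run status events
(``PipelineRunStarted`` / ``PipelineRunCompleted`` / ``PipelineRunErrored``).
"""
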
try:
    import modal
except ModuleNotFoundError:
    modal = None

from typing import Any, List, Optional
from uuid import UUID
from cognee.modules.data.methods import get_dataset
from cognee.modules.data.models import Dataset
from cognee.modules.pipelines.tasks.task import Task
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.pipelines.models import (
    PipelineRunStarted,
    PipelineRunCompleted,
    PipelineRunErrored,
)
from cognee.modules.pipelines.operations import (
    log_pipeline_run_start,
    log_pipeline_run_complete,
    log_pipeline_run_error,
)
from cognee.modules.pipelines.utils import generate_pipeline_id
from cognee.modules.users.methods import get_default_user
from cognee.shared.logging_utils import get_logger
from cognee.modules.users.models import User
from cognee.modules.pipelines.exceptions import PipelineRunFailedError
from cognee.tasks.ingestion import resolve_data_directories
from .run_tasks_data_item import run_tasks_data_item

logger = get_logger("run_tasks_distributed()")

if modal:
    import os

    from distributed.app import app
    from distributed.modal_image import image

    # Modal secret (a set of environment variables) attached to the remote
    # containers; the secret name can be overridden via MODAL_SECRET_NAME.
    secret_name = os.environ.get("MODAL_SECRET_NAME", "distributed_cognee")

    @app.function(
        retries=3,
        image=image,
        timeout=86400,  # 24 hours, in seconds
        max_containers=50,
        secrets=[modal.Secret.from_name(secret_name)],
    )
    async def run_tasks_on_modal(
        data_item,
        dataset: Dataset,
        tasks: List[Task],
        pipeline_name: str,
        pipeline_id: str,
        pipeline_run_id: str,
        context: Optional[dict],
        user: User,
        incremental_loading: bool,
    ):
"""
Wrapper that runs the run_tasks_data_item function.
This is the function/code that runs on modal executor and produces the graph/vector db objects
"""
        # Local import: module initialization for the relational engine happens
        # inside the Modal container rather than on the coordinating host.
        from cognee.infrastructure.databases.relational import get_relational_engine

        result = await run_tasks_data_item(
            data_item=data_item,
            dataset=dataset,
            tasks=tasks,
            pipeline_name=pipeline_name,
            pipeline_id=pipeline_id,
            pipeline_run_id=pipeline_run_id,
            context={
                **(context or {}),
                "user": user,
                "data": data_item,
                "dataset": dataset,
            },
            user=user,
            incremental_loading=incremental_loading,
        )

        return result
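

# Note: run_tasks_on_modal is only defined when the `modal` package is
# importable, so the runner below assumes a Modal-enabled deployment.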
async def run_tasks_distributed(
    tasks: List[Task],
    dataset_id: UUID,
    data: Optional[List[Any]] = None,
    user: Optional[User] = None,
    pipeline_name: str = "unknown_pipeline",
    context: Optional[dict] = None,
    incremental_loading: bool = False,
    data_per_batch: int = 20,
):
    """
    Run the given tasks for every data item on Modal, one remote call per item,
    yielding pipeline run status events as the run progresses.
    """
    if not user:
        user = await get_default_user()

    dataset = await get_dataset(user.id, dataset_id)

    pipeline_id: UUID = generate_pipeline_id(user.id, dataset.id, pipeline_name)

    pipeline_run = await log_pipeline_run_start(pipeline_id, pipeline_name, dataset.id, data)
    pipeline_run_id: UUID = pipeline_run.pipeline_run_id

    yield PipelineRunStarted(
        pipeline_run_id=pipeline_run_id,
        dataset_id=dataset.id,
        dataset_name=dataset.name,
        payload=data,
    )

    try:
        if not isinstance(data, list):
            data = [data]

        data = await resolve_data_directories(data)

        number_of_data_items = len(data) if isinstance(data, list) else 1

        data_item_tasks = [
            data,
            [dataset] * number_of_data_items,
            [tasks] * number_of_data_items,
            [pipeline_name] * number_of_data_items,
            [pipeline_id] * number_of_data_items,
            [pipeline_run_id] * number_of_data_items,
            [context] * number_of_data_items,
            [user] * number_of_data_items,
            [incremental_loading] * number_of_data_items,
        ]
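
        # Fan out one Modal call per data item: each list above is one column
        # of arguments, and Function.map zips them into per-item calls the way
        # Python's built-in map does; .aio exposes the results as an async
        # iterator.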
        results = []

        async for result in run_tasks_on_modal.map.aio(*data_item_tasks):
            # Skip data items whose run produced no result.
            if not result:
                continue

            results.append(result)
# If any data item failed, raise PipelineRunFailedError
errored = [
r
for r in results
if r and r.get("run_info") and isinstance(r["run_info"], PipelineRunErrored)
]
if errored:
raise PipelineRunFailedError("Pipeline run failed. Data item could not be processed.")

        await log_pipeline_run_complete(
            pipeline_run_id, pipeline_id, pipeline_name, dataset.id, data
        )

        yield PipelineRunCompleted(
            pipeline_run_id=pipeline_run_id,
            dataset_id=dataset.id,
            dataset_name=dataset.name,
            data_ingestion_info=results,
        )
    except Exception as error:
        await log_pipeline_run_error(
            pipeline_run_id, pipeline_id, pipeline_name, dataset.id, data, error
        )

        yield PipelineRunErrored(
            pipeline_run_id=pipeline_run_id,
            payload=repr(error),
            dataset_id=dataset.id,
            dataset_name=dataset.name,
            # "results" is unbound if the failure happened before the map loop.
            data_ingestion_info=locals().get("results"),
        )

        # A PipelineRunFailedError is already reported through the yielded
        # PipelineRunErrored event; re-raise anything unexpected.
        if not isinstance(error, PipelineRunFailedError):
            raise
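

# Usage sketch (hypothetical names; assumes an existing dataset and at least
# one ingestion Task):
#
#     import asyncio
#     from uuid import UUID
#
#     async def main():
#         async for status in run_tasks_distributed(
#             tasks=[Task(extract_chunks)],  # extract_chunks is a placeholder
#             dataset_id=UUID("00000000-0000-0000-0000-000000000000"),
#             data=["./documents/report.pdf"],
#         ):
#             print(type(status).__name__)  # Started, then Completed/Errored
#
#     asyncio.run(main())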