feat: incremental load initial commit

Igor Ilic committed 2025-07-10 20:46:36 +02:00
parent dad1e27052
commit af2b0735f6
9 changed files with 96 additions and 33 deletions

View file

@@ -242,6 +242,7 @@ async def run_cognify_as_background_process(
     async def handle_rest_of_the_run(pipeline_list):
         # Execute all provided pipelines one by one to avoid database write conflicts
+        # TODO: Convert to async gather task instead of for loop when Queue mechanism for database is created
         for pipeline in pipeline_list:
             while True:
                 try:
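
The retry loop above exists so that pipelines never write to the database concurrently: each pipeline must finish (or exhaust its retries) before the next one starts. As a standalone illustration, a minimal sketch of that serial run-with-retry pattern (run_one, max_attempts, and the backoff policy are hypothetical, not cognee's API):

import asyncio


async def run_serially_with_retry(pipeline_list, max_attempts=3):
    # Run each pipeline to completion before starting the next,
    # so two pipelines never write to the database at the same time.
    for run_one in pipeline_list:
        for attempt in range(1, max_attempts + 1):
            try:
                await run_one()  # hypothetical: each entry is an async callable
                break
            except Exception:
                if attempt == max_attempts:
                    raise
                await asyncio.sleep(2**attempt)  # back off, then retry this pipeline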

View file

@@ -24,6 +24,7 @@ class Data(Base):
     token_count = Column(Integer)
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
+    pipeline_status = Column(JSON)

     datasets = relationship(
         "Dataset",

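The new JSON column gives each Data row a place to record per-pipeline ingestion state, which is what incremental loading needs: already-processed items can be recognized on the next run. A hedged sketch of updating such a column with SQLAlchemy (mark_pipeline_status, the status string, and the session handling are assumptions for illustration, not part of this commit):

from sqlalchemy import select

from cognee.modules.data.models import Data


async def mark_pipeline_status(session, data_id, pipeline_name, status):
    data = (await session.execute(select(Data).where(Data.id == data_id))).scalar_one()
    # Reassign a fresh dict: in-place mutation of a plain JSON column is not
    # change-tracked by SQLAlchemy, so the UPDATE would never be emitted.
    updated = dict(data.pipeline_status or {})
    updated[pipeline_name] = status  # hypothetical status string
    data.pipeline_status = updated
    await session.commit()
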
View file

@@ -0,0 +1,2 @@
+from .get_s3_fs import get_s3_fs
+from .open_data_file import open_data_file

View file

@@ -0,0 +1,14 @@
+from cognee.api.v1.add.config import get_s3_config
+
+
+def get_s3_fs():
+    s3_config = get_s3_config()
+
+    fs = None
+    if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
+        import s3fs
+
+        fs = s3fs.S3FileSystem(
+            key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
+        )
+    return fs
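
Because get_s3_fs returns None when AWS credentials are not configured, callers must treat the handle as optional. A small usage sketch (the bucket path is hypothetical):

from cognee.modules.ingestion.methods import get_s3_fs

fs = get_s3_fs()
if fs is None:
    print("No AWS credentials configured; only local files can be read.")
else:
    # s3fs.S3FileSystem follows the fsspec interface, so open() works like builtin open.
    with fs.open("s3://my-bucket/example.txt", mode="rb") as f:  # hypothetical object
        print(f.read(64))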

View file

@@ -0,0 +1,6 @@
+def open_data_file(file_path: str, s3fs):
+    if file_path.startswith("s3://"):
+        return s3fs.open(file_path, mode="rb")
+    else:
+        local_path = file_path.replace("file://", "")
+        return open(local_path, mode="rb")
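
The s3fs handle is only dereferenced for s3:// paths, so passing s3fs=None is safe for local files; both branches return a binary file object usable as a context manager. For example (the path is hypothetical):

from cognee.modules.ingestion.methods import open_data_file

# Local file:// path: the S3 filesystem argument is never touched here.
with open_data_file("file:///tmp/example.txt", s3fs=None) as f:
    header = f.read(16)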

View file

@@ -9,6 +9,7 @@ class PipelineRunInfo(BaseModel):
     dataset_id: UUID
     dataset_name: str
     payload: Optional[Any] = None
+    data_ingestion_info: Optional[dict] = None

     model_config = {
         "arbitrary_types_allowed": True,

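The new field lets a run-info event carry per-item ingestion outcomes next to the overall result; run_tasks below fills it with a dict keyed by data id. A hedged sketch of consuming it (the entry layout mirrors what this commit builds in run_tasks, but it is an internal shape and may change):

def summarize_ingestion(run_info):
    # run_info is a PipelineRunCompleted or PipelineRunErrored instance.
    for data_id, entry in (run_info.data_ingestion_info or {}).items():
        status = type(entry["run_info"]).__name__  # PipelineRunCompleted / PipelineRunErrored
        print(f"{data_id}: {status}")
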
View file

@@ -2,6 +2,7 @@ import asyncio
 from typing import Union
 from uuid import NAMESPACE_OID, uuid5, UUID

+from cognee.modules.ingestion.exceptions import IngestionError
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
 from cognee.modules.data.models import Data, Dataset
@@ -26,6 +27,7 @@ from cognee.modules.data.methods import (
 from cognee.modules.pipelines.models.PipelineRunInfo import (
     PipelineRunCompleted,
     PipelineRunStarted,
+    PipelineRunErrored,
 )
 from cognee.infrastructure.databases.relational import (

View file

@@ -1,4 +1,6 @@
 import os
+
+import cognee.modules.ingestion as ingestion
 from uuid import UUID
 from typing import Any
 from functools import wraps
@@ -6,9 +8,11 @@ from functools import wraps
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.pipelines.operations.run_tasks_distributed import run_tasks_distributed
 from cognee.modules.users.models import User
+from cognee.modules.ingestion.methods import get_s3_fs, open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.utils import generate_pipeline_id
+from cognee.tasks.ingestion import save_data_item_to_storage, resolve_data_directories
 from cognee.modules.pipelines.models.PipelineRunInfo import (
     PipelineRunCompleted,
     PipelineRunErrored,
@@ -79,20 +83,67 @@ async def run_tasks(
             payload=data,
         )

+        fs = get_s3_fs()
+
+        data_items_pipeline_run_info = {}
+
+        ingestion_error = None
+
         try:
-            async for result in run_tasks_with_telemetry(
-                tasks=tasks,
-                data=data,
-                user=user,
-                pipeline_name=pipeline_id,
-                context=context,
-            ):
-                yield PipelineRunYield(
-                    pipeline_run_id=pipeline_run_id,
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                    payload=result,
-                )
+            if not isinstance(data, list):
+                data = [data]
+
+            data = await resolve_data_directories(data)
+
+            # TODO: Convert to async gather task instead of for loop (just make sure it can work; there were some issues when async gathering datasets)
+            for data_item in data:
+                file_path = await save_data_item_to_storage(data_item, dataset.name)
+
+                # Ingest data and add metadata
+                with open_data_file(file_path, s3fs=fs) as file:
+                    classified_data = ingestion.classify(file, s3fs=fs)
+
+                    # data_id is the hash of file contents + owner id to avoid duplicate data
+                    data_id = ingestion.identify(classified_data, user)
+
+                try:
+                    async for result in run_tasks_with_telemetry(
+                        tasks=tasks,
+                        data=data_item,
+                        user=user,
+                        pipeline_name=pipeline_id,
+                        context=context,
+                    ):
+                        yield PipelineRunYield(
+                            pipeline_run_id=pipeline_run_id,
+                            dataset_id=dataset.id,
+                            dataset_name=dataset.name,
+                            payload=result,
+                        )
+
+                    data_items_pipeline_run_info[data_id] = {
+                        "run_info": PipelineRunCompleted(
+                            pipeline_run_id=pipeline_run_id,
+                            dataset_id=dataset.id,
+                            dataset_name=dataset.name,
+                        ),
+                        "data_id": data_id,
+                    }
+                except Exception as error:
+                    # Temporarily swallow the error and try to process the rest of the documents first, then re-raise it at the end of the data ingestion pipeline
+                    ingestion_error = error
+                    logger.error(
+                        f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
+                    )
+                    data_items_pipeline_run_info[data_id] = {
+                        "run_info": PipelineRunErrored(
+                            pipeline_run_id=pipeline_run_id,
+                            payload=error,
+                            dataset_id=dataset.id,
+                            dataset_name=dataset.name,
+                        ),
+                        "data_id": data_id,
+                    }
+
+            # Re-raise the error found during data ingestion
+            if ingestion_error:
+                raise ingestion_error

             await log_pipeline_run_complete(
                 pipeline_run_id, pipeline_id, pipeline_name, dataset_id, data
@@ -102,6 +153,7 @@ async def run_tasks(
             pipeline_run_id=pipeline_run_id,
             dataset_id=dataset.id,
             dataset_name=dataset.name,
+            data_ingestion_info=data_items_pipeline_run_info,
         )

     except Exception as error:
@@ -114,6 +166,7 @@ async def run_tasks(
             payload=error,
             dataset_id=dataset.id,
             dataset_name=dataset.name,
+            data_ingestion_info=data_items_pipeline_run_info,
         )

         raise error
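
The control flow above deliberately swallows each per-item failure, records it, and re-raises the most recently captured error only once every data item has been attempted. In isolation the pattern looks like this (a minimal sketch; process_item stands in for the per-item pipeline run):

async def process_all(items, process_item):
    outcomes = {}
    deferred_error = None
    for item in items:
        try:
            outcomes[item] = await process_item(item)
        except Exception as error:
            # Record the failure but keep going, so one bad item
            # does not block ingestion of the remaining items.
            deferred_error = error
            outcomes[item] = error
    if deferred_error:
        # Surface the (last) failure once the whole batch was attempted.
        raise deferred_error
    return outcomes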

View file

@@ -9,6 +9,7 @@ from cognee.modules.data.models import Data
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
+from cognee.modules.ingestion.methods import get_s3_fs, open_data_file
 from cognee.modules.data.methods import (
     get_authorized_existing_datasets,
     get_dataset_data,
@@ -18,9 +19,6 @@ from cognee.modules.data.methods import (
 from .save_data_item_to_storage import save_data_item_to_storage
-
-from cognee.api.v1.add.config import get_s3_config
-

 async def ingest_data(
     data: Any,
     dataset_name: str,
@@ -31,22 +29,7 @@ async def ingest_data(
     if not user:
         user = await get_default_user()

-    s3_config = get_s3_config()
-
-    fs = None
-    if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
-        import s3fs
-
-        fs = s3fs.S3FileSystem(
-            key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
-        )
-
-    def open_data_file(file_path: str):
-        if file_path.startswith("s3://"):
-            return fs.open(file_path, mode="rb")
-        else:
-            local_path = file_path.replace("file://", "")
-            return open(local_path, mode="rb")
+    fs = get_s3_fs()

     def get_external_metadata_dict(data_item: Union[BinaryIO, str, Any]) -> dict[str, Any]:
         if hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")):
@@ -95,7 +78,7 @@ async def ingest_data(
         file_path = await save_data_item_to_storage(data_item, dataset_name)

         # Ingest data and add metadata
-        with open_data_file(file_path) as file:
+        with open_data_file(file_path, s3fs=fs) as file:
             classified_data = ingestion.classify(file, s3fs=fs)

             # data_id is the hash of file contents + owner id to avoid duplicate data
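
With this change, ingest_data and run_tasks share one code path for opening local and S3 files instead of each wiring up its own S3 client. A usage sketch of the composed helpers (read_all is illustrative, not part of the commit):

from cognee.modules.ingestion.methods import get_s3_fs, open_data_file


def read_all(file_paths):
    # Create the (optional) S3 filesystem once and reuse it for every file.
    fs = get_s3_fs()  # None without AWS credentials; file:// paths still work
    payloads = []
    for path in file_paths:
        with open_data_file(path, s3fs=fs) as f:
            payloads.append(f.read())
    return payloads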