feat: incremental load initial commit

This commit is contained in:
Igor Ilic 2025-07-10 20:46:36 +02:00
parent dad1e27052
commit af2b0735f6
9 changed files with 96 additions and 33 deletions

View file

@ -242,6 +242,7 @@ async def run_cognify_as_background_process(
async def handle_rest_of_the_run(pipeline_list):
# Execute all provided pipelines one by one to avoid database write conflicts
# TODO: Convert to async gather task instead of for loop when Queue mechanism for database is created
for pipeline in pipeline_list:
while True:
try:

View file

@ -24,6 +24,7 @@ class Data(Base):
token_count = Column(Integer)
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
pipeline_status = Column(JSON)
datasets = relationship(
"Dataset",

View file

@ -0,0 +1,2 @@
from .get_s3_fs import get_s3_fs
from .open_data_file import open_data_file

View file

@ -0,0 +1,14 @@
from cognee.api.v1.add.config import get_s3_config
def get_s3_fs():
    """Build an S3 filesystem handle from the configured AWS credentials.

    Returns:
        An ``s3fs.S3FileSystem`` authenticated with the configured access
        key id and secret, or ``None`` when either credential is missing.
    """
    config = get_s3_config()
    # Without both credentials there is nothing to authenticate with,
    # so callers get None and fall back to local-file handling.
    if config.aws_access_key_id is None or config.aws_secret_access_key is None:
        return None
    # Imported lazily so environments without s3fs installed can still
    # use the non-S3 code paths.
    import s3fs

    return s3fs.S3FileSystem(
        key=config.aws_access_key_id,
        secret=config.aws_secret_access_key,
        anon=False,
    )

View file

@ -0,0 +1,6 @@
def open_data_file(file_path: str, s3fs):
    """Open ``file_path`` for binary reading.

    Args:
        file_path: Either an ``s3://`` URI or a local path, optionally
            prefixed with a ``file://`` scheme.
        s3fs: Filesystem handle used for ``s3://`` paths (e.g. an
            ``s3fs.S3FileSystem``); may be ``None`` when only local
            paths are expected.

    Returns:
        A binary file-like object opened in ``"rb"`` mode.
    """
    if file_path.startswith("s3://"):
        return s3fs.open(file_path, mode="rb")
    else:
        # Strip only a LEADING "file://" scheme. str.replace would also
        # mangle any literal "file://" occurring later in the path.
        local_path = file_path.removeprefix("file://")
        return open(local_path, mode="rb")

View file

@ -9,6 +9,7 @@ class PipelineRunInfo(BaseModel):
dataset_id: UUID
dataset_name: str
payload: Optional[Any] = None
data_ingestion_info: Optional[dict] = None
model_config = {
"arbitrary_types_allowed": True,

View file

@ -2,6 +2,7 @@ import asyncio
from typing import Union
from uuid import NAMESPACE_OID, uuid5, UUID
from cognee.modules.ingestion.exceptions import IngestionError
from cognee.shared.logging_utils import get_logger
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.models import Data, Dataset
@ -26,6 +27,7 @@ from cognee.modules.data.methods import (
from cognee.modules.pipelines.models.PipelineRunInfo import (
PipelineRunCompleted,
PipelineRunStarted,
PipelineRunErrored,
)
from cognee.infrastructure.databases.relational import (

View file

@ -1,4 +1,6 @@
import os
import cognee.modules.ingestion as ingestion
from uuid import UUID
from typing import Any
from functools import wraps
@ -6,9 +8,11 @@ from functools import wraps
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.pipelines.operations.run_tasks_distributed import run_tasks_distributed
from cognee.modules.users.models import User
from cognee.modules.ingestion.methods import get_s3_fs, open_data_file
from cognee.shared.logging_utils import get_logger
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines.utils import generate_pipeline_id
from cognee.tasks.ingestion import save_data_item_to_storage, resolve_data_directories
from cognee.modules.pipelines.models.PipelineRunInfo import (
PipelineRunCompleted,
PipelineRunErrored,
@ -79,20 +83,67 @@ async def run_tasks(
payload=data,
)
fs = get_s3_fs()
data_items_pipeline_run_info = {}
ingestion_error = None
try:
async for result in run_tasks_with_telemetry(
tasks=tasks,
data=data,
user=user,
pipeline_name=pipeline_id,
context=context,
):
yield PipelineRunYield(
pipeline_run_id=pipeline_run_id,
dataset_id=dataset.id,
dataset_name=dataset.name,
payload=result,
)
if not isinstance(data, list):
data = [data]
data = await resolve_data_directories(data)
# TODO: Convert to async gather task instead of for loop (just make sure it can work there were some issues when async gathering datasets)
for data_item in data:
file_path = await save_data_item_to_storage(data_item, dataset.name)
# Ingest data and add metadata
with open_data_file(file_path, s3fs=fs) as file:
classified_data = ingestion.classify(file, s3fs=fs)
# data_id is the hash of file contents + owner id to avoid duplicate data
data_id = ingestion.identify(classified_data, user)
try:
async for result in run_tasks_with_telemetry(
tasks=tasks,
data=data_item,
user=user,
pipeline_name=pipeline_id,
context=context,
):
yield PipelineRunYield(
pipeline_run_id=pipeline_run_id,
dataset_id=dataset.id,
dataset_name=dataset.name,
payload=result,
)
data_items_pipeline_run_info[data_id] = {
"run_info": PipelineRunCompleted(
pipeline_run_id=pipeline_run_id,
dataset_id=dataset.id,
dataset_name=dataset.name,
),
"data_id": data_id,
}
except Exception as error:
# Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
ingestion_error = error
logger.error(
f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
)
data_items_pipeline_run_info = {
"run_info": PipelineRunErrored(
pipeline_run_id=pipeline_run_id,
payload=error,
dataset_id=dataset.id,
dataset_name=dataset.name,
),
"data_id": data_id,
}
# re-raise error found during data ingestion
if ingestion_error:
raise ingestion_error
await log_pipeline_run_complete(
pipeline_run_id, pipeline_id, pipeline_name, dataset_id, data
@ -102,6 +153,7 @@ async def run_tasks(
pipeline_run_id=pipeline_run_id,
dataset_id=dataset.id,
dataset_name=dataset.name,
data_ingestion_info=data_items_pipeline_run_info,
)
except Exception as error:
@ -114,6 +166,7 @@ async def run_tasks(
payload=error,
dataset_id=dataset.id,
dataset_name=dataset.name,
data_ingestion_info=data_items_pipeline_run_info,
)
raise error

View file

@ -9,6 +9,7 @@ from cognee.modules.data.models import Data
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
from cognee.modules.ingestion.methods import get_s3_fs, open_data_file
from cognee.modules.data.methods import (
get_authorized_existing_datasets,
get_dataset_data,
@ -18,9 +19,6 @@ from cognee.modules.data.methods import (
from .save_data_item_to_storage import save_data_item_to_storage
from cognee.api.v1.add.config import get_s3_config
async def ingest_data(
data: Any,
dataset_name: str,
@ -31,22 +29,7 @@ async def ingest_data(
if not user:
user = await get_default_user()
s3_config = get_s3_config()
fs = None
if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
import s3fs
fs = s3fs.S3FileSystem(
key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
)
def open_data_file(file_path: str):
if file_path.startswith("s3://"):
return fs.open(file_path, mode="rb")
else:
local_path = file_path.replace("file://", "")
return open(local_path, mode="rb")
fs = get_s3_fs()
def get_external_metadata_dict(data_item: Union[BinaryIO, str, Any]) -> dict[str, Any]:
if hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")):
@ -95,7 +78,7 @@ async def ingest_data(
file_path = await save_data_item_to_storage(data_item, dataset_name)
# Ingest data and add metadata
with open_data_file(file_path) as file:
with open_data_file(file_path, s3fs=fs) as file:
classified_data = ingestion.classify(file, s3fs=fs)
# data_id is the hash of file contents + owner id to avoid duplicate data