cognee/cognee/modules/data/models/Data.py
Igor Ilic 14ba3e8829
feat: Enable async execution of data items for incremental loading (#1092)

## Description
Makes incremental loading of data items run asynchronously.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
2025-07-29 10:39:31 -04:00


from datetime import datetime, timezone
from uuid import uuid4

from sqlalchemy import UUID, Column, DateTime, String, JSON, Integer
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy.orm import relationship

from cognee.infrastructure.databases.relational import Base

from .DatasetData import DatasetData


class Data(Base):
    __tablename__ = "data"

    id = Column(UUID, primary_key=True, default=uuid4)
    name = Column(String)
    extension = Column(String)
    mime_type = Column(String)
    raw_data_location = Column(String)
    owner_id = Column(UUID, index=True)
    tenant_id = Column(UUID, index=True, nullable=True)
    content_hash = Column(String)
    external_metadata = Column(JSON)
    # Store the NodeSet as a JSON list of strings
    node_set = Column(JSON, nullable=True)
    # MutableDict lets SQLAlchemy notice key-value changes; without it, changing
    # the value for an existing key wouldn't be noticed when committing a database session.
    pipeline_status = Column(MutableDict.as_mutable(JSON))
    token_count = Column(Integer)
    data_size = Column(Integer, nullable=True)  # File size in bytes
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
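    # Many-to-many link to Dataset through the DatasetData association table.
    # lazy="noload" keeps the collection unloaded unless it is populated
    # explicitly via a query option, so plain lookups stay lightweight.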
    datasets = relationship(
        "Dataset",
        secondary=DatasetData.__tablename__,
        back_populates="data",
        lazy="noload",
        cascade="all, delete",
    )

    def to_json(self) -> dict:
        return {
            "id": str(self.id),
            "name": self.name,
            "extension": self.extension,
            "mimeType": self.mime_type,
            "rawDataLocation": self.raw_data_location,
            "createdAt": self.created_at.isoformat(),
            "updatedAt": self.updated_at.isoformat() if self.updated_at else None,
            "nodeSet": self.node_set,
            # "datasets": [dataset.to_json() for dataset in self.datasets]
        }
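
A minimal sketch of why the MutableDict wrapper on pipeline_status matters, assuming cognee's get_relational_engine helper and its async session context manager; the lookup key and status string below are placeholders, not values defined in this file:

# Illustrative sketch, not part of Data.py: in-place updates to pipeline_status
# are only flushed because the column is wrapped in MutableDict.as_mutable(JSON).
from cognee.infrastructure.databases.relational import get_relational_engine  # assumed helper

async def mark_pipeline_finished(data_id, dataset_id):
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        data_item = await session.get(Data, data_id)
        # With a plain JSON column this in-place mutation would not mark the row
        # dirty; MutableDict propagates the change event to the session, so
        # commit() actually persists the new status.
        data_item.pipeline_status[str(dataset_id)] = "DATASET_PROCESSING_FINISHED"  # placeholder status value
        await session.commit()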