diff --git a/cognee/infrastructure/files/utils/get_file_metadata.py b/cognee/infrastructure/files/utils/get_file_metadata.py index ead226650..ebdd7cd05 100644 --- a/cognee/infrastructure/files/utils/get_file_metadata.py +++ b/cognee/infrastructure/files/utils/get_file_metadata.py @@ -1,4 +1,5 @@ import io +import os.path from typing import BinaryIO, TypedDict from cognee.shared.logging_utils import get_logger @@ -22,6 +23,7 @@ class FileMetadata(TypedDict): mime_type: str extension: str content_hash: str + file_size: int def get_file_metadata(file: BinaryIO) -> FileMetadata: @@ -55,10 +57,17 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata: file_path = getattr(file, "name", None) or getattr(file, "full_name", None) file_name = str(file_path).split("/")[-1].split(".")[0] if file_path else None + # Get file size + pos = file.tell() # remember current pointer + file.seek(0, os.SEEK_END) # jump to end + file_size = file.tell() # byte count + file.seek(pos) + return FileMetadata( name=file_name, file_path=file_path, mime_type=file_type.mime, extension=file_type.extension, content_hash=content_hash, + file_size=file_size, ) diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py index 49ab28271..0ae2efa94 100644 --- a/cognee/modules/data/models/Data.py +++ b/cognee/modules/data/models/Data.py @@ -18,10 +18,12 @@ class Data(Base): mime_type = Column(String) raw_data_location = Column(String) owner_id = Column(UUID, index=True) + tenant_id = Column(UUID, index=True, default=None) content_hash = Column(String) external_metadata = Column(JSON) node_set = Column(JSON, nullable=True) # Store NodeSet as JSON list of strings token_count = Column(Integer) + data_size = Column(Integer) # File size in bytes created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) diff --git a/cognee/tasks/ingestion/ingest_data.py 
b/cognee/tasks/ingestion/ingest_data.py index f3fa43b62..48120b489 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -125,8 +125,10 @@ async def ingest_data( data_point.mime_type = file_metadata["mime_type"] data_point.owner_id = user.id data_point.content_hash = file_metadata["content_hash"] + data_point.data_size = file_metadata["file_size"] data_point.external_metadata = ext_metadata data_point.node_set = json.dumps(node_set) if node_set else None + data_point.tenant_id = user.tenant_id if user.tenant_id else None # Check if data is already in dataset if str(data_point.id) in dataset_data_map: @@ -148,6 +150,8 @@ async def ingest_data( content_hash=file_metadata["content_hash"], external_metadata=ext_metadata, node_set=json.dumps(node_set) if node_set else None, + data_size=file_metadata["file_size"], + tenant_id=user.tenant_id if user.tenant_id else None, token_count=-1, )