feat: Data size info tracking (#1088)

## Description

Adds file size tracking during ingestion: `get_file_metadata` now measures the underlying stream and returns a `file_size` field, the `Data` model gains a `data_size` column (bytes), and `ingest_data` persists that value for both new and existing data points.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Commit f68fd59b95 (parent 219db2f03d), authored by Igor Ilic on 2025-07-14 19:03:58 +02:00 and committed via GitHub.
3 changed files with 15 additions and 0 deletions


@@ -1,4 +1,5 @@
import io
import os.path
from typing import BinaryIO, TypedDict
from cognee.shared.logging_utils import get_logger
@@ -22,6 +23,7 @@ class FileMetadata(TypedDict):
    mime_type: str
    extension: str
    content_hash: str
    file_size: int


def get_file_metadata(file: BinaryIO) -> FileMetadata:
@@ -55,10 +57,17 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
    file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
    file_name = str(file_path).split("/")[-1].split(".")[0] if file_path else None

    # Get file size
    pos = file.tell()  # remember current pointer
    file.seek(0, os.SEEK_END)  # jump to end
    file_size = file.tell()  # byte count
    file.seek(pos)

    return FileMetadata(
        name=file_name,
        file_path=file_path,
        mime_type=file_type.mime,
        extension=file_type.extension,
        content_hash=content_hash,
        file_size=file_size,
    )
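
For reference, here is a small standalone sketch of the size-measurement pattern used above: remember the current offset, seek to the end to read the byte count, then seek back so callers see an unchanged stream. The `io.BytesIO` buffer and sample bytes are illustrative only.

```python
import io
import os

buffer = io.BytesIO(b"hello cognee")
buffer.read(5)  # simulate a caller that has already consumed part of the stream

pos = buffer.tell()          # remember current pointer (5)
buffer.seek(0, os.SEEK_END)  # jump to end
file_size = buffer.tell()    # byte count (12)
buffer.seek(pos)             # restore the caller's position

assert file_size == 12
assert buffer.read() == b" cognee"  # original read position preserved
```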


@@ -18,10 +18,12 @@ class Data(Base):
    mime_type = Column(String)
    raw_data_location = Column(String)
    owner_id = Column(UUID, index=True)
    tenant_id = Column(UUID, index=True, default=None)
    content_hash = Column(String)
    external_metadata = Column(JSON)
    node_set = Column(JSON, nullable=True)  # Store NodeSet as JSON list of strings
    token_count = Column(Integer)
    data_size = Column(Integer)  # File size in bytes
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
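
Since `Data` gains a new `data_size` column, existing databases need a schema migration. A hypothetical Alembic sketch follows; the `data` table name, the revision identifiers, and the use of Alembic itself are assumptions, not taken from this PR.

```python
"""Hypothetical migration: add data_size to the data table (names assumed)."""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic (placeholders).
revision = "add_data_size"
down_revision = None


def upgrade() -> None:
    # Nullable, since rows ingested before this change have no recorded size.
    op.add_column("data", sa.Column("data_size", sa.Integer(), nullable=True))


def downgrade() -> None:
    op.drop_column("data", "data_size")
```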


@@ -125,8 +125,10 @@ async def ingest_data(
data_point.mime_type = file_metadata["mime_type"]
data_point.owner_id = user.id
data_point.content_hash = file_metadata["content_hash"]
data_point.data_size = file_metadata["file_size"]
data_point.external_metadata = ext_metadata
data_point.node_set = json.dumps(node_set) if node_set else None
data_point.tenant_id = user.tenant_id if user.tenant_id else None
# Check if data is already in dataset
if str(data_point.id) in dataset_data_map:
@@ -148,6 +150,8 @@ async def ingest_data(
content_hash=file_metadata["content_hash"],
external_metadata=ext_metadata,
node_set=json.dumps(node_set) if node_set else None,
data_size=file_metadata["file_size"],
tenant_id=user.tenant_id if user.tenant_id else None,
token_count=-1,
)
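
With the size persisted on each record, storage usage can be reported with a simple aggregate. Below is a minimal, self-contained sketch using a simplified stand-in for the `Data` model and an in-memory SQLite database purely for illustration.

```python
import uuid

from sqlalchemy import Column, Integer, String, create_engine, func, select
from sqlalchemy.orm import DeclarativeBase, Session


class Base(DeclarativeBase):
    pass


class Data(Base):
    # Simplified stand-in for the model above; only the columns used here.
    __tablename__ = "data"
    id = Column(String, primary_key=True)
    owner_id = Column(String, index=True)
    data_size = Column(Integer)  # file size in bytes


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    owner = str(uuid.uuid4())
    session.add_all(
        [
            Data(id=str(uuid.uuid4()), owner_id=owner, data_size=1_024),
            Data(id=str(uuid.uuid4()), owner_id=owner, data_size=2_048),
        ]
    )
    session.commit()

    total = session.execute(
        select(func.coalesce(func.sum(Data.data_size), 0)).where(Data.owner_id == owner)
    ).scalar_one()
    print(f"Total bytes ingested for this owner: {total}")  # 3072
```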