feat: Data size info tracking (#1088)
## Description

Track the size of ingested data: `get_file_metadata` now measures each file's byte count, the `Data` model gains a `data_size` column, and the ingestion task stores the measured size alongside the other file metadata.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
Parent: `219db2f03d`
Commit: `f68fd59b95`

3 changed files with 15 additions and 0 deletions
**`get_file_metadata` — file metadata extraction**

```diff
@@ -1,4 +1,5 @@
 import io
+import os.path
 from typing import BinaryIO, TypedDict
 
 from cognee.shared.logging_utils import get_logger
@@ -22,6 +23,7 @@ class FileMetadata(TypedDict):
     mime_type: str
     extension: str
     content_hash: str
+    file_size: int
 
 
 def get_file_metadata(file: BinaryIO) -> FileMetadata:
@@ -55,10 +57,17 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
     file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
     file_name = str(file_path).split("/")[-1].split(".")[0] if file_path else None
 
+    # Get file size
+    pos = file.tell()  # remember current pointer
+    file.seek(0, os.SEEK_END)  # jump to end
+    file_size = file.tell()  # byte count
+    file.seek(pos)
+
     return FileMetadata(
         name=file_name,
         file_path=file_path,
         mime_type=file_type.mime,
         extension=file_type.extension,
         content_hash=content_hash,
+        file_size=file_size,
     )
```
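The size is obtained purely through `tell`/`seek`, so it works for any seekable `BinaryIO` without loading the payload into memory, and restoring the saved position keeps subsequent reads of the stream unaffected. A standalone sketch of the same technique (`stream_size` is a hypothetical helper, not part of this commit):

```python
import io
import os
from typing import BinaryIO


def stream_size(file: BinaryIO) -> int:
    """Byte count of a seekable binary stream, leaving its read position intact."""
    pos = file.tell()          # remember the current pointer
    file.seek(0, os.SEEK_END)  # jump to the end of the stream
    size = file.tell()         # the end offset equals the byte count
    file.seek(pos)             # restore the original position
    return size


# Works for in-memory buffers and opened files alike.
buf = io.BytesIO(b"hello world")
buf.read(5)                    # move the pointer mid-stream
assert stream_size(buf) == 11  # size is measured...
assert buf.tell() == 5         # ...without disturbing the reader
```

One caveat: non-seekable streams (e.g. pipes or `socket.makefile()` objects) raise `io.UnsupportedOperation` on `seek`, so callers passing such objects would need to buffer them first.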
**`Data` — relational data model**

```diff
@@ -18,10 +18,12 @@ class Data(Base):
     mime_type = Column(String)
     raw_data_location = Column(String)
     owner_id = Column(UUID, index=True)
+    tenant_id = Column(UUID, index=True, default=None)
     content_hash = Column(String)
     external_metadata = Column(JSON)
     node_set = Column(JSON, nullable=True)  # Store NodeSet as JSON list of strings
     token_count = Column(Integer)
+    data_size = Column(Integer)  # File size in bytes
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
 
```
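Because `Data` gains a new column, existing databases need a schema migration; none appears in this diff. A minimal sketch of what one could look like, assuming Alembic and a table named `data` (the revision identifiers and table name are placeholders, not taken from this commit):

```python
"""Add the data_size column to the data table."""

import sqlalchemy as sa
from alembic import op

# Placeholder revision identifiers.
revision = "xxxxxxxxxxxx"
down_revision = "yyyyyyyyyyyy"
branch_labels = None
depends_on = None


def upgrade() -> None:
    # Nullable, so rows ingested before this change remain valid without a backfill.
    op.add_column("data", sa.Column("data_size", sa.Integer(), nullable=True))


def downgrade() -> None:
    op.drop_column("data", "data_size")
```

Leaving the column nullable sidesteps a backfill; older rows simply report no recorded size until re-ingested.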
**`ingest_data` — ingestion task**

```diff
@@ -125,8 +125,10 @@ async def ingest_data(
     data_point.mime_type = file_metadata["mime_type"]
     data_point.owner_id = user.id
     data_point.content_hash = file_metadata["content_hash"]
+    data_point.file_size = file_metadata["file_size"]
     data_point.external_metadata = ext_metadata
     data_point.node_set = json.dumps(node_set) if node_set else None
+    data_point.tenant_id = user.tenant_id if user.tenant_id else None
 
     # Check if data is already in dataset
     if str(data_point.id) in dataset_data_map:
@@ -148,6 +150,8 @@ async def ingest_data(
         content_hash=file_metadata["content_hash"],
         external_metadata=ext_metadata,
         node_set=json.dumps(node_set) if node_set else None,
+        data_size=file_metadata["file_size"],
+        tenant_id=user.tenant_id if user.tenant_id else None,
         token_count=-1,
     )
 
```
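With the column populated at ingestion time, stored sizes can be queried back like any other `Data` attribute (note that the update path above assigns `data_point.file_size`, while the column itself is named `data_size`). A hypothetical read-back sketch, assuming an async SQLAlchemy session; `total_ingested_bytes` is illustrative and not part of this commit:

```python
from uuid import UUID

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from cognee.modules.data.models import Data  # the model extended above; path assumed


async def total_ingested_bytes(session: AsyncSession, data_ids: list[UUID]) -> int:
    """Sum the recorded file sizes for a set of Data rows."""
    stmt = select(Data.data_size).where(Data.id.in_(data_ids))
    sizes = (await session.execute(stmt)).scalars().all()
    # Rows ingested before this change have no size recorded; skip the NULLs.
    return sum(size for size in sizes if size is not None)
```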