feat: Data size info tracking (#1088)
## Description

Track the size of ingested data. `get_file_metadata` now measures each file's size in bytes, the `Data` model stores it in a new `data_size` column, and `ingest_data` persists it for both existing and newly created data points. The change also adds a `tenant_id` column populated from the ingesting user.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
Parent: 219db2f03d
Commit: f68fd59b95

3 changed files with 15 additions and 0 deletions
@@ -1,4 +1,5 @@
 import io
+import os.path
 from typing import BinaryIO, TypedDict
 
 from cognee.shared.logging_utils import get_logger
@@ -22,6 +23,7 @@ class FileMetadata(TypedDict):
     mime_type: str
     extension: str
     content_hash: str
+    file_size: int
 
 
 def get_file_metadata(file: BinaryIO) -> FileMetadata:
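For reference, a minimal standalone sketch of a TypedDict extended the same way (the trimmed-down `FileInfo` below is an illustrative analogue, not the full `FileMetadata` definition):

```python
from typing import TypedDict


class FileInfo(TypedDict):
    # Trimmed-down analogue of FileMetadata with the newly added size field.
    mime_type: str
    extension: str
    content_hash: str
    file_size: int  # new: size of the underlying file, in bytes


info: FileInfo = {
    "mime_type": "text/plain",
    "extension": "txt",
    "content_hash": "abc123",
    "file_size": 2048,
}
```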
@@ -55,10 +57,17 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
     file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
     file_name = str(file_path).split("/")[-1].split(".")[0] if file_path else None
 
+    # Get file size
+    pos = file.tell()  # remember current pointer
+    file.seek(0, os.SEEK_END)  # jump to end
+    file_size = file.tell()  # byte count
+    file.seek(pos)
+
     return FileMetadata(
         name=file_name,
         file_path=file_path,
         mime_type=file_type.mime,
         extension=file_type.extension,
         content_hash=content_hash,
+        file_size=file_size,
     )
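The size measurement added above is the usual seek/tell pattern on a seekable stream. A minimal, self-contained sketch of the same technique (the `measure_stream_size` helper below is illustrative only, not part of this change):

```python
import io
import os
from typing import BinaryIO


def measure_stream_size(file: BinaryIO) -> int:
    """Return the total byte count of a seekable stream without moving the caller's position."""
    pos = file.tell()          # remember the current pointer
    file.seek(0, os.SEEK_END)  # jump to the end of the stream
    size = file.tell()         # offset at the end equals the total byte count
    file.seek(pos)             # restore the original position
    return size


if __name__ == "__main__":
    buffer = io.BytesIO(b"hello cognee")
    buffer.seek(5)                      # simulate a stream that has already been partially read
    print(measure_stream_size(buffer))  # 12
    print(buffer.tell())                # 5 -- the position is unchanged
```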
@@ -18,10 +18,12 @@ class Data(Base):
     mime_type = Column(String)
     raw_data_location = Column(String)
     owner_id = Column(UUID, index=True)
+    tenant_id = Column(UUID, index=True, default=None)
     content_hash = Column(String)
     external_metadata = Column(JSON)
     node_set = Column(JSON, nullable=True)  # Store NodeSet as JSON list of strings
     token_count = Column(Integer)
+    data_size = Column(Integer)  # File size in bytes
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
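For orientation, a minimal sketch of how the two new columns sit in a SQLAlchemy declarative model (the real `Data` model has many more columns; the imports, base class, and the `UUID` type used here are assumptions made to keep the example runnable):

```python
from datetime import datetime, timezone

from sqlalchemy import JSON, Column, DateTime, Integer, String
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Data(Base):
    __tablename__ = "data"

    id = Column(UUID(as_uuid=True), primary_key=True)
    owner_id = Column(UUID(as_uuid=True), index=True)
    tenant_id = Column(UUID(as_uuid=True), index=True, default=None)  # new: tenant of the ingesting user, if any
    mime_type = Column(String)
    content_hash = Column(String)
    node_set = Column(JSON, nullable=True)
    token_count = Column(Integer)
    data_size = Column(Integer)  # new: file size in bytes, taken from file_metadata["file_size"]
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
```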
@@ -125,8 +125,10 @@ async def ingest_data(
         data_point.mime_type = file_metadata["mime_type"]
         data_point.owner_id = user.id
         data_point.content_hash = file_metadata["content_hash"]
+        data_point.file_size = file_metadata["file_size"]
         data_point.external_metadata = ext_metadata
         data_point.node_set = json.dumps(node_set) if node_set else None
+        data_point.tenant_id = user.tenant_id if user.tenant_id else None
 
         # Check if data is already in dataset
         if str(data_point.id) in dataset_data_map:
@@ -148,6 +150,8 @@ async def ingest_data(
             content_hash=file_metadata["content_hash"],
             external_metadata=ext_metadata,
             node_set=json.dumps(node_set) if node_set else None,
+            data_size=file_metadata["file_size"],
+            tenant_id=user.tenant_id if user.tenant_id else None,
             token_count=-1,
         )
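To make the data flow concrete, a simplified sketch of how the new values travel from the extracted metadata onto the stored record (the `DataRecord` dataclass and `build_record` helper below are stand-ins for illustration, not cognee's actual `Data` model or `ingest_data` logic):

```python
from dataclasses import dataclass
from typing import Optional
from uuid import UUID


@dataclass
class DataRecord:
    """Stand-in for the Data row, reduced to the fields touched by this change."""
    content_hash: str
    data_size: int                     # bytes, from file_metadata["file_size"]
    tenant_id: Optional[UUID] = None   # carried over from the ingesting user, if any


def build_record(file_metadata: dict, tenant_id: Optional[UUID]) -> DataRecord:
    # Mirrors the new assignments in ingest_data: the measured file size and the
    # user's tenant id are copied onto the record that gets persisted.
    return DataRecord(
        content_hash=file_metadata["content_hash"],
        data_size=file_metadata["file_size"],
        tenant_id=tenant_id if tenant_id else None,
    )


if __name__ == "__main__":
    metadata = {"content_hash": "abc123", "file_size": 2048}
    record = build_record(metadata, tenant_id=None)
    print(record.data_size)  # 2048
```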