# Listing header from the code viewer: 74 lines, 2.1 KiB, Python
import io
|
|
import os.path
|
|
from typing import BinaryIO, TypedDict
|
|
from pathlib import Path
|
|
|
|
from cognee.shared.logging_utils import get_logger
|
|
from cognee.infrastructure.files.utils.get_file_content_hash import get_file_content_hash
|
|
from .guess_file_type import guess_file_type
|
|
|
|
# Module-level logger registered under the name "FileMetadata"; get_file_metadata
# uses it to report failures while computing a file's content hash.
logger = get_logger("FileMetadata")
|
|
|
|
|
|
class FileMetadata(TypedDict):
    """
    Represents metadata for a file.

    This class defines a structure to store various attributes related to a file, including
    its name, file path, MIME type, file extension, a content hash for integrity
    checking, and the file size in bytes.
    """

    # File name; get_file_metadata fills this with the path's stem (no final suffix).
    # NOTE(review): get_file_metadata passes None here when the file object exposes
    # no path, despite the ``str`` annotation.
    name: str
    # Path taken from the file object's ``name`` (or ``full_name``) attribute.
    # NOTE(review): may likewise be None when neither attribute is available.
    file_path: str
    # MIME type as reported by guess_file_type.
    mime_type: str
    # File extension as reported by guess_file_type.
    extension: str
    # Hash of the file content as returned by get_file_content_hash.
    content_hash: str
    # Size of the stream in bytes.
    file_size: int
|
|
|
|
|
|
async def get_file_metadata(file: BinaryIO) -> FileMetadata:
    """
    Retrieve metadata from a file object.

    Reset the file pointer to the beginning of the file and compute the content hash. Guess
    the file type and extract the file path and name. Measure the file size, then construct
    and return a dictionary containing the file's metadata attributes.

    Parameters:
    -----------

        - file (BinaryIO): A file-like object from which to extract metadata.

    Returns:
    --------

        - FileMetadata: A dictionary containing the file's name, path, MIME type, file
          extension, content hash, and size in bytes. ``content_hash`` is None when
          hashing raised ``io.UnsupportedOperation``; ``name`` and ``file_path`` are
          None when the file object exposes neither ``name`` nor ``full_name``.
    """
    # Bind a default up front: if hashing fails below, the error is only logged, and
    # the return statement must not trip over an unbound local.
    content_hash = None

    try:
        file.seek(0)
        content_hash = await get_file_content_hash(file)
        # Rewind so the type guess below starts from the beginning of the stream.
        file.seek(0)
    except io.UnsupportedOperation as error:
        # Not every file-like object has a ``name``; look it up defensively so the
        # log call itself cannot raise AttributeError from inside the handler.
        logged_name = getattr(file, "name", None)
        logger.error(f"Error retrieving content hash for file: {logged_name} \n{str(error)}\n\n")

    file_type = guess_file_type(file)

    file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
    file_name = Path(file_path).stem if file_path else None

    # Measure the size by seeking to the end, then restore the previous position so
    # the caller sees the pointer where guess_file_type left it.
    # NOTE(review): seek/tell failures here are not caught and propagate to the caller.
    pos = file.tell()  # remember current pointer
    file.seek(0, os.SEEK_END)  # jump to end
    file_size = file.tell()  # byte count
    file.seek(pos)

    return FileMetadata(
        name=file_name,
        file_path=file_path,
        mime_type=file_type.mime,
        extension=file_type.extension,
        content_hash=content_hash,
        file_size=file_size,
    )
|