diff --git a/cognee/infrastructure/files/utils/get_file_metadata.py b/cognee/infrastructure/files/utils/get_file_metadata.py index 23b10a6df..debc7bbaf 100644 --- a/cognee/infrastructure/files/utils/get_file_metadata.py +++ b/cognee/infrastructure/files/utils/get_file_metadata.py @@ -1,6 +1,6 @@ import io import os.path -from typing import BinaryIO, TypedDict +from typing import BinaryIO, TypedDict, Optional from pathlib import Path from cognee.shared.logging_utils import get_logger @@ -27,7 +27,7 @@ class FileMetadata(TypedDict): file_size: int -async def get_file_metadata(file: BinaryIO) -> FileMetadata: +async def get_file_metadata(file: BinaryIO, name: Optional[str] = None) -> FileMetadata: """ Retrieve metadata from a file object. @@ -53,15 +53,15 @@ async def get_file_metadata(file: BinaryIO) -> FileMetadata: except io.UnsupportedOperation as error: logger.error(f"Error retrieving content hash for file: {file.name} \n{str(error)}\n\n") - file_type = guess_file_type(file) + file_type = guess_file_type(file, name=name) file_path = getattr(file, "name", None) or getattr(file, "full_name", None) if isinstance(file_path, str): file_name = Path(file_path).stem if file_path else None else: - # In case file_path does not exist or is a integer return None - file_name = None + # In case file_path does not exist try file_name + file_name = name # Get file size pos = file.tell() # remember current pointer diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py index 4e3ff6824..78b20c93d 100644 --- a/cognee/infrastructure/files/utils/guess_file_type.py +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -1,6 +1,9 @@ -from typing import BinaryIO +import io +from pathlib import Path +from typing import BinaryIO, Optional, Any import filetype -from .is_text_content import is_text_content +from tempfile import SpooledTemporaryFile +from filetype.types.base import Type class FileTypeException(Exception): @@ -22,7 +25,7 @@ class FileTypeException(Exception): self.message = message -def guess_file_type(file: BinaryIO) -> filetype.Type: +def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type: """ Guess the file type from the given binary file stream. @@ -39,12 +42,23 @@ def guess_file_type(file: BinaryIO) -> filetype.Type: - filetype.Type: The guessed file type, represented as filetype.Type. """ + + # Note: If file has .txt or .text extension, consider it a plain text file as filetype.guess may not detect it properly + # as it contains no magic number encoding + ext = None + if isinstance(file, str): + ext = Path(file).suffix + elif name is not None: + ext = Path(name).suffix + + if ext in [".txt", ".text"]: + file_type = Type("text/plain", "txt") + return file_type + file_type = filetype.guess(file) # If file type could not be determined consider it a plain text file as they don't have magic number encoding if file_type is None: - from filetype.types.base import Type - file_type = Type("text/plain", "txt") if file_type is None: diff --git a/cognee/modules/ingestion/data_types/BinaryData.py b/cognee/modules/ingestion/data_types/BinaryData.py index f96e0d65c..9448dddcf 100644 --- a/cognee/modules/ingestion/data_types/BinaryData.py +++ b/cognee/modules/ingestion/data_types/BinaryData.py @@ -30,7 +30,7 @@ class BinaryData(IngestionData): async def ensure_metadata(self): if self.metadata is None: - self.metadata = await get_file_metadata(self.data) + self.metadata = await get_file_metadata(self.data, name=self.name) if self.metadata["name"] is None: self.metadata["name"] = self.name