cognee/cognitive_architecture/infrastructure/files/utils/get_file_metadata.py
2024-03-12 13:42:51 +01:00

44 lines
1.1 KiB
Python

import os
from typing import BinaryIO, TypedDict

import filetype
from unstructured.cleaners.core import clean
from unstructured.partition.pdf import partition_pdf

from .extract_keywords import extract_keywords
class FileTypeException(Exception):
    """Error carrying a human-readable ``message`` about a file's type.

    The message is stored on the ``message`` attribute and is also
    forwarded to ``Exception`` so that ``str(exc)`` and ``exc.args``
    contain it regardless of whether it was passed positionally or by
    keyword.
    """

    message: str

    def __init__(self, message: str):
        # Chain to the base class so ``args``/``str()`` are populated even
        # for ``FileTypeException(message=...)``; without this call the
        # keyword form leaves ``args`` empty.
        super().__init__(message)
        self.message = message
class FileMetadata(TypedDict):
    """Dictionary shape returned by ``get_file_metadata``."""

    # File name with the directory part and everything from the first
    # dot onwards removed.
    name: str
    # Value of the file object's ``name`` attribute.
    file_path: str
    # MIME type reported by the file-type detection.
    mime_type: str
    # File extension reported by the file-type detection (e.g. "pdf").
    extension: str
    # Keywords extracted from the file's text; empty for non-PDF files.
    keywords: list[str]
def get_file_metadata(file: BinaryIO) -> FileMetadata:
    """Collect name, path, type and keywords for an open binary file.

    The file type is detected with ``filetype.guess``. When the detected
    extension is ``"pdf"`` the file is partitioned with ``partition_pdf``
    (``"fast"`` strategy), each element's text is cleaned, the pieces are
    joined with newlines and passed to ``extract_keywords``. Any other
    file type gets an empty keyword list.

    Args:
        file: Open binary file object. Its ``name`` attribute must be a
            string path; it is returned as ``file_path`` and used to
            derive ``name``.

    Returns:
        A ``FileMetadata`` mapping with ``name``, ``file_path``,
        ``mime_type``, ``extension`` and ``keywords``.

    Raises:
        FileTypeException: If ``filetype.guess`` returns ``None``.
    """
    file_type = filetype.guess(file)

    if file_type is None:
        raise FileTypeException("Unknown file detected.")

    keywords: list[str] = []

    if file_type.extension == "pdf":
        elements = partition_pdf(file=file, strategy="fast")
        keywords = extract_keywords(
            "\n".join(clean(element.text) for element in elements)
        )

    file_path = file.name
    # os.path.basename honours the platform's path separators, so paths
    # using "\\" on Windows are handled too; on POSIX it is equivalent to
    # splitting on "/". Everything from the first dot onwards is dropped,
    # so "archive.tar.gz" yields "archive".
    file_name = os.path.basename(file_path).split(".")[0]

    return FileMetadata(
        name=file_name,
        file_path=file_path,
        mime_type=file_type.mime,
        extension=file_type.extension,
        keywords=keywords,
    )