From bc82430fb50726cec4464074c9d06fe00bea2535 Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Fri, 29 Nov 2024 14:36:03 +0100 Subject: [PATCH] Merge latest COG-519 --- .../files/utils/get_file_metadata.py | 1 + .../modules/data/operations/write_metadata.py | 21 ++++++++++++------- .../ingestion/ingest_data_with_metadata.py | 3 +-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/cognee/infrastructure/files/utils/get_file_metadata.py b/cognee/infrastructure/files/utils/get_file_metadata.py index 4aea9560e..a114ef48f 100644 --- a/cognee/infrastructure/files/utils/get_file_metadata.py +++ b/cognee/infrastructure/files/utils/get_file_metadata.py @@ -4,6 +4,7 @@ from .guess_file_type import guess_file_type class FileMetadata(TypedDict): name: str + file_path: str mime_type: str extension: str diff --git a/cognee/modules/data/operations/write_metadata.py b/cognee/modules/data/operations/write_metadata.py index 4b550a6bf..a2ea644ac 100644 --- a/cognee/modules/data/operations/write_metadata.py +++ b/cognee/modules/data/operations/write_metadata.py @@ -4,14 +4,15 @@ import re import warnings from typing import Any from uuid import UUID +from typing import Any, BinaryIO, Union from cognee.infrastructure.databases.relational import get_relational_engine - +from cognee.infrastructure.files.utils.get_file_metadata import FileMetadata from ..models.Metadata import Metadata -async def write_metadata(data_item: Any, data_id: UUID) -> UUID: - metadata_dict = get_metadata_dict(data_item) +async def write_metadata(data_item: Union[BinaryIO, str, Any], data_id: UUID, file_metadata: FileMetadata) -> UUID: + metadata_dict = get_metadata_dict(data_item, file_metadata) db_engine = get_relational_engine() async with db_engine.get_async_session() as session: metadata = Metadata( @@ -34,14 +35,18 @@ def parse_type(type_: Any) -> str: raise Exception(f"type: {type_} could not be parsed") -def get_metadata_dict(metadata: Any) -> dict[str, Any]: - if hasattr(metadata, "dict") and inspect.ismethod(getattr(metadata, "dict")): - return metadata.dict() +def get_metadata_dict(data_item: Union[BinaryIO, str, Any], file_metadata: FileMetadata) -> dict[str, Any]: + if isinstance(data_item, str): + return(file_metadata) + elif isinstance(data_item, BinaryIO): + return(file_metadata) + elif hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")): + return {**file_metadata, **data_item.dict()} else: warnings.warn( - f"metadata of type {type(metadata)}: {str(metadata)[:20]}... does not have dict method. Defaulting to string method" + f"metadata of type {type(data_item)}: {str(data_item)[:20]}... does not have dict method. Defaulting to string method" ) try: - return {"content": str(metadata)} + return {**dict(file_metadata), "content": str(data_item)} except Exception as e: raise Exception(f"Could not cast metadata to string: {e}") diff --git a/cognee/tasks/ingestion/ingest_data_with_metadata.py b/cognee/tasks/ingestion/ingest_data_with_metadata.py index 573e2c3c1..0c17b71f5 100644 --- a/cognee/tasks/ingestion/ingest_data_with_metadata.py +++ b/cognee/tasks/ingestion/ingest_data_with_metadata.py @@ -1,7 +1,6 @@ from typing import Any import dlt - import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.methods import create_dataset @@ -76,7 +75,7 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User): dataset.data.append(data_point) await session.commit() - await write_metadata(data_item, data_point.id) + await write_metadata(data_item, data_point.id, file_metadata) yield {