import hashlib
import os
from typing import BinaryIO, Union

# MD5 consumes input in 64-byte blocks, but issuing one read() per block is
# needlessly slow on large or unbuffered streams. The digest does not depend
# on how the input is split, so read in larger pieces.
_HASH_CHUNK_SIZE = 64 * 1024


def _feed_hash(hasher, stream: BinaryIO, chunk_size: int = _HASH_CHUNK_SIZE) -> None:
    """Feed everything from the current position of *stream* to EOF into *hasher*."""
    while True:
        chunk = stream.read(chunk_size)
        # An empty (falsy) read result marks the end of the stream.
        if not chunk:
            break
        hasher.update(chunk)


def get_file_content_hash(file_obj: Union[str, os.PathLike, BinaryIO]) -> str:
    """Return the hexadecimal MD5 digest of a file's content.

    The content is read in fixed-size chunks, so the whole file is never
    held in memory at once.

    Args:
        file_obj: Either a filesystem path (``str`` or ``os.PathLike``),
            which is opened in binary mode and closed again here, or an
            already-open binary stream exposing ``read(size)``.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.

    Note:
        For a stream argument, hashing starts at the stream's current
        position and the stream is left at EOF; it is neither rewound nor
        closed. Callers that need the whole content hashed must ``seek(0)``
        first.
    """
    hasher = hashlib.md5()

    if isinstance(file_obj, (str, os.PathLike)):
        # Path input: this function owns the handle, so close it when done.
        with open(file_obj, "rb") as file:
            _feed_hash(hasher, file)
    else:
        # Stream input: the caller owns the handle; only read from it.
        _feed_hash(hasher, file_obj)

    return hasher.hexdigest()