diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 10430ed8d..39ee01964 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -33,11 +33,11 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_nam # data is text else: - file_path = save_data_to_file(data, dataset_name) + file_path = save_data_to_file(data) return await add([file_path], dataset_name) if hasattr(data, "file"): - file_path = save_data_to_file(data.file, dataset_name, filename = data.filename) + file_path = save_data_to_file(data.file, filename = data.filename) return await add([file_path], dataset_name) # data is a list of file paths or texts @@ -45,13 +45,13 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_nam for data_item in data: if hasattr(data_item, "file"): - file_paths.append(save_data_to_file(data_item, dataset_name, filename = data_item.filename)) + file_paths.append(save_data_to_file(data_item, filename = data_item.filename)) elif isinstance(data_item, str) and ( data_item.startswith("/") or data_item.startswith("file://") ): file_paths.append(data_item) elif isinstance(data_item, str): - file_paths.append(save_data_to_file(data_item, dataset_name)) + file_paths.append(save_data_to_file(data_item)) if len(file_paths) > 0: return await add_files(file_paths, dataset_name, user) diff --git a/cognee/modules/ingestion/save_data_to_file.py b/cognee/modules/ingestion/save_data_to_file.py index 1bbfaec37..1af6ab0aa 100644 --- a/cognee/modules/ingestion/save_data_to_file.py +++ b/cognee/modules/ingestion/save_data_to_file.py @@ -1,25 +1,28 @@ -import string -import random +import os.path +import hashlib from typing import BinaryIO, Union from cognee.base_config import get_base_config from cognee.infrastructure.files.storage import LocalStorage from .classify import classify -def save_data_to_file(data: Union[str, BinaryIO], dataset_name: str, filename: str = None): +def save_data_to_file(data: Union[str, BinaryIO], filename: str = None): base_config = get_base_config() data_directory_path = base_config.data_root_directory classified_data = classify(data, filename) - storage_path = data_directory_path + "/" + dataset_name.replace(".", "/") + storage_path = os.path.join(data_directory_path, "data") LocalStorage.ensure_directory_exists(storage_path) file_metadata = classified_data.get_metadata() if "name" not in file_metadata or file_metadata["name"] is None: - letters = string.ascii_lowercase - random_string = "".join(random.choice(letters) for _ in range(32)) - file_metadata["name"] = "text_" + random_string + ".txt" + data_contents = classified_data.get_data().encode('utf-8') + hash_contents = hashlib.md5(data_contents).hexdigest() + file_metadata["name"] = "text_" + hash_contents + ".txt" file_name = file_metadata["name"] - LocalStorage(storage_path).store(file_name, classified_data.get_data()) + + # Don't save file if it already exists + if not os.path.isfile(os.path.join(storage_path, file_name)): + LocalStorage(storage_path).store(file_name, classified_data.get_data()) return "file://" + storage_path + "/" + file_name diff --git a/cognee/tasks/ingestion/ingest_data_with_metadata.py b/cognee/tasks/ingestion/ingest_data_with_metadata.py index 7c0b840eb..59e4a16f3 100644 --- a/cognee/tasks/ingestion/ingest_data_with_metadata.py +++ b/cognee/tasks/ingestion/ingest_data_with_metadata.py @@ -90,8 +90,6 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User): mime_type = file_metadata["mime_type"] ) - - # Check if data is already in dataset dataset_data = ( await session.execute(select(DatasetData).filter(DatasetData.data_id == data_id, diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index e2a7c8ee7..88d499e74 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -7,7 +7,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str # data is a file object coming from upload. if hasattr(data_item, "file"): - file_path = save_data_to_file(data_item.file, dataset_name, filename=data_item.filename) + file_path = save_data_to_file(data_item.file, filename=data_item.filename) elif isinstance(data_item, str): # data is a file path @@ -15,7 +15,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str file_path = data_item.replace("file://", "") # data is text else: - file_path = save_data_to_file(data_item, dataset_name) + file_path = save_data_to_file(data_item) else: raise IngestionError(message=f"Data type not supported: {type(data_item)}") diff --git a/cognee/tasks/ingestion/save_data_item_with_metadata_to_storage.py b/cognee/tasks/ingestion/save_data_item_with_metadata_to_storage.py index d758ebcd1..06dde11bd 100644 --- a/cognee/tasks/ingestion/save_data_item_with_metadata_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_with_metadata_to_storage.py @@ -17,7 +17,7 @@ async def save_data_item_with_metadata_to_storage( # data is a file object coming from upload. elif hasattr(data_item, "file"): file_path = save_data_to_file( - data_item.file, dataset_name, filename=data_item.filename + data_item.file, filename=data_item.filename ) elif isinstance(data_item, str): @@ -26,7 +26,7 @@ async def save_data_item_with_metadata_to_storage( file_path = data_item.replace("file://", "") # data is text else: - file_path = save_data_to_file(data_item, dataset_name) + file_path = save_data_to_file(data_item) else: raise IngestionError(message=f"Data type not supported: {type(data_item)}") diff --git a/cognee/tasks/ingestion/transform_data.py b/cognee/tasks/ingestion/transform_data.py index c2ea86c47..898ac6e71 100644 --- a/cognee/tasks/ingestion/transform_data.py +++ b/cognee/tasks/ingestion/transform_data.py @@ -8,11 +8,11 @@ def get_data_from_llama_index(data_point: Union[Document, ImageDocument], datase if type(data_point) == Document: file_path = data_point.metadata.get("file_path") if file_path is None: - file_path = save_data_to_file(data_point.text, dataset_name) + file_path = save_data_to_file(data_point.text) return file_path return file_path elif type(data_point) == ImageDocument: if data_point.image_path is None: - file_path = save_data_to_file(data_point.text, dataset_name) + file_path = save_data_to_file(data_point.text) return file_path return data_point.image_path \ No newline at end of file