feat: Add text deduplication

If text is added to cognee, it will be saved under a hash of its contents, so the same text can't be stored multiple times.

Feature COG-505
This commit is contained in:
Igor Ilic 2024-12-04 17:19:29 +01:00
parent 6bb0f3d8f2
commit 0ce254b262
6 changed files with 21 additions and 20 deletions

View file

@ -33,11 +33,11 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_nam
# data is text
else:
file_path = save_data_to_file(data, dataset_name)
file_path = save_data_to_file(data)
return await add([file_path], dataset_name)
if hasattr(data, "file"):
file_path = save_data_to_file(data.file, dataset_name, filename = data.filename)
file_path = save_data_to_file(data.file, filename = data.filename)
return await add([file_path], dataset_name)
# data is a list of file paths or texts
@ -45,13 +45,13 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_nam
for data_item in data:
if hasattr(data_item, "file"):
file_paths.append(save_data_to_file(data_item, dataset_name, filename = data_item.filename))
file_paths.append(save_data_to_file(data_item, filename = data_item.filename))
elif isinstance(data_item, str) and (
data_item.startswith("/") or data_item.startswith("file://")
):
file_paths.append(data_item)
elif isinstance(data_item, str):
file_paths.append(save_data_to_file(data_item, dataset_name))
file_paths.append(save_data_to_file(data_item))
if len(file_paths) > 0:
return await add_files(file_paths, dataset_name, user)

View file

@ -1,25 +1,28 @@
import string
import random
import os.path
import hashlib
from typing import BinaryIO, Union
from cognee.base_config import get_base_config
from cognee.infrastructure.files.storage import LocalStorage
from .classify import classify
def save_data_to_file(data: Union[str, BinaryIO], dataset_name: str, filename: str = None):
def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
base_config = get_base_config()
data_directory_path = base_config.data_root_directory
classified_data = classify(data, filename)
storage_path = data_directory_path + "/" + dataset_name.replace(".", "/")
storage_path = os.path.join(data_directory_path, "data")
LocalStorage.ensure_directory_exists(storage_path)
file_metadata = classified_data.get_metadata()
if "name" not in file_metadata or file_metadata["name"] is None:
letters = string.ascii_lowercase
random_string = "".join(random.choice(letters) for _ in range(32))
file_metadata["name"] = "text_" + random_string + ".txt"
data_contents = classified_data.get_data().encode('utf-8')
hash_contents = hashlib.md5(data_contents).hexdigest()
file_metadata["name"] = "text_" + hash_contents + ".txt"
file_name = file_metadata["name"]
LocalStorage(storage_path).store(file_name, classified_data.get_data())
# Don't save file if it already exists
if not os.path.isfile(os.path.join(storage_path, file_name)):
LocalStorage(storage_path).store(file_name, classified_data.get_data())
return "file://" + storage_path + "/" + file_name

View file

@ -90,8 +90,6 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
mime_type = file_metadata["mime_type"]
)
# Check if data is already in dataset
dataset_data = (
await session.execute(select(DatasetData).filter(DatasetData.data_id == data_id,

View file

@ -7,7 +7,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str
# data is a file object coming from upload.
if hasattr(data_item, "file"):
file_path = save_data_to_file(data_item.file, dataset_name, filename=data_item.filename)
file_path = save_data_to_file(data_item.file, filename=data_item.filename)
elif isinstance(data_item, str):
# data is a file path
@ -15,7 +15,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str
file_path = data_item.replace("file://", "")
# data is text
else:
file_path = save_data_to_file(data_item, dataset_name)
file_path = save_data_to_file(data_item)
else:
raise IngestionError(message=f"Data type not supported: {type(data_item)}")

View file

@ -17,7 +17,7 @@ async def save_data_item_with_metadata_to_storage(
# data is a file object coming from upload.
elif hasattr(data_item, "file"):
file_path = save_data_to_file(
data_item.file, dataset_name, filename=data_item.filename
data_item.file, filename=data_item.filename
)
elif isinstance(data_item, str):
@ -26,7 +26,7 @@ async def save_data_item_with_metadata_to_storage(
file_path = data_item.replace("file://", "")
# data is text
else:
file_path = save_data_to_file(data_item, dataset_name)
file_path = save_data_to_file(data_item)
else:
raise IngestionError(message=f"Data type not supported: {type(data_item)}")

View file

@ -8,11 +8,11 @@ def get_data_from_llama_index(data_point: Union[Document, ImageDocument], datase
if type(data_point) == Document:
file_path = data_point.metadata.get("file_path")
if file_path is None:
file_path = save_data_to_file(data_point.text, dataset_name)
file_path = save_data_to_file(data_point.text)
return file_path
return file_path
elif type(data_point) == ImageDocument:
if data_point.image_path is None:
file_path = save_data_to_file(data_point.text, dataset_name)
file_path = save_data_to_file(data_point.text)
return file_path
return data_point.image_path