feat: Add text deduplication
If text is added to cognee, it is saved under a hash of its contents, so the same text cannot be stored multiple times. Feature COG-505.
parent 6bb0f3d8f2
commit 0ce254b262

6 changed files with 21 additions and 20 deletions
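At its core, the change replaces the random 32-character file name that used to be generated for raw text with a name derived from an MD5 hash of the contents, so identical text always maps to the same file. A minimal, self-contained sketch of that naming scheme (the helper name is illustrative, not from the codebase):

```python
import hashlib

def content_addressed_name(text: str) -> str:
    # Identical text hashes to the same digest, so duplicate
    # submissions collapse onto a single file name.
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()
    return "text_" + digest + ".txt"

assert content_addressed_name("hello") == content_addressed_name("hello")
assert content_addressed_name("hello") != content_addressed_name("world")
```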
```diff
@@ -33,11 +33,11 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_name
     # data is text
     else:
-        file_path = save_data_to_file(data, dataset_name)
+        file_path = save_data_to_file(data)
         return await add([file_path], dataset_name)

     if hasattr(data, "file"):
-        file_path = save_data_to_file(data.file, dataset_name, filename = data.filename)
+        file_path = save_data_to_file(data.file, filename = data.filename)
         return await add([file_path], dataset_name)

     # data is a list of file paths or texts
@@ -45,13 +45,13 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_name
     for data_item in data:
         if hasattr(data_item, "file"):
-            file_paths.append(save_data_to_file(data_item, dataset_name, filename = data_item.filename))
+            file_paths.append(save_data_to_file(data_item, filename = data_item.filename))
         elif isinstance(data_item, str) and (
             data_item.startswith("/") or data_item.startswith("file://")
         ):
             file_paths.append(data_item)
         elif isinstance(data_item, str):
-            file_paths.append(save_data_to_file(data_item, dataset_name))
+            file_paths.append(save_data_to_file(data_item))

     if len(file_paths) > 0:
         return await add_files(file_paths, dataset_name, user)
```
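Note that the branch order in the loop above matters: the path test (`startswith("/")` or `startswith("file://")`) must run before the generic string test, or every file path would be re-saved as text. A condensed sketch of the same dispatch (function name and return labels are illustrative, not from the codebase):

```python
from typing import Any

def classify_item(data_item: Any) -> str:
    # Upload objects expose a .file attribute.
    if hasattr(data_item, "file"):
        return "upload"
    # Absolute paths and file:// URLs are used as-is; nothing is copied.
    if isinstance(data_item, str) and (
        data_item.startswith("/") or data_item.startswith("file://")
    ):
        return "path"
    # Any other string is raw text, saved under its content hash.
    if isinstance(data_item, str):
        return "text"
    raise TypeError(f"Data type not supported: {type(data_item)}")

assert classify_item("/tmp/notes.txt") == "path"
assert classify_item("file:///tmp/notes.txt") == "path"
assert classify_item("some raw text") == "text"
```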
```diff
@@ -1,25 +1,28 @@
-import string
-import random
+import os.path
+import hashlib
 from typing import BinaryIO, Union
 from cognee.base_config import get_base_config
 from cognee.infrastructure.files.storage import LocalStorage
 from .classify import classify

-def save_data_to_file(data: Union[str, BinaryIO], dataset_name: str, filename: str = None):
+def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
     base_config = get_base_config()
     data_directory_path = base_config.data_root_directory

     classified_data = classify(data, filename)

-    storage_path = data_directory_path + "/" + dataset_name.replace(".", "/")
+    storage_path = os.path.join(data_directory_path, "data")
     LocalStorage.ensure_directory_exists(storage_path)

     file_metadata = classified_data.get_metadata()
     if "name" not in file_metadata or file_metadata["name"] is None:
-        letters = string.ascii_lowercase
-        random_string = "".join(random.choice(letters) for _ in range(32))
-        file_metadata["name"] = "text_" + random_string + ".txt"
+        data_contents = classified_data.get_data().encode('utf-8')
+        hash_contents = hashlib.md5(data_contents).hexdigest()
+        file_metadata["name"] = "text_" + hash_contents + ".txt"
     file_name = file_metadata["name"]
-    LocalStorage(storage_path).store(file_name, classified_data.get_data())
+
+    # Don't save file if it already exists
+    if not os.path.isfile(os.path.join(storage_path, file_name)):
+        LocalStorage(storage_path).store(file_name, classified_data.get_data())

     return "file://" + storage_path + "/" + file_name
```
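After this change, saving the same text twice should return the same `file://` URL and write the file only once. A self-contained sketch that mirrors the new `save_data_to_file` flow for plain text, using only the standard library (the directory layout and naming follow the diff; `LocalStorage` is replaced by a plain file write):

```python
import hashlib
import os
import tempfile

def save_text(data_root: str, text: str) -> str:
    # One shared "data" directory instead of a per-dataset path.
    storage_path = os.path.join(data_root, "data")
    os.makedirs(storage_path, exist_ok=True)

    # Hash-derived name: identical text maps to an identical file.
    file_name = "text_" + hashlib.md5(text.encode("utf-8")).hexdigest() + ".txt"
    full_path = os.path.join(storage_path, file_name)

    # Don't save the file if it already exists.
    if not os.path.isfile(full_path):
        with open(full_path, "w", encoding="utf-8") as f:
            f.write(text)

    return "file://" + storage_path + "/" + file_name

root = tempfile.mkdtemp()
first = save_text(root, "same text")
second = save_text(root, "same text")
assert first == second
assert len(os.listdir(os.path.join(root, "data"))) == 1
```

One trade-off worth noting: because file names no longer include the dataset, the same text added under two different dataset names now resolves to a single stored file, which is the deduplication the commit message describes.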
```diff
@@ -90,8 +90,6 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
                 mime_type = file_metadata["mime_type"]
             )

-
-
         # Check if data is already in dataset
         dataset_data = (
             await session.execute(select(DatasetData).filter(DatasetData.data_id == data_id,
```
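The truncated query above looks up whether a given `data_id` is already linked to the dataset before inserting a new association. A hedged sketch of that kind of membership check with SQLAlchemy's async API (`DatasetData` stands in for the project's Data-to-Dataset association model, and the `dataset_id` filter is an assumption, since the line is cut off):

```python
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

async def is_data_in_dataset(session: AsyncSession, data_id, dataset_id) -> bool:
    # DatasetData: the project's association model (import omitted here).
    result = await session.execute(
        select(DatasetData).filter(
            DatasetData.data_id == data_id,
            DatasetData.dataset_id == dataset_id,  # assumed second condition
        )
    )
    return result.scalars().first() is not None
```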
```diff
@@ -7,7 +7,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str
     # data is a file object coming from upload.
     if hasattr(data_item, "file"):
-        file_path = save_data_to_file(data_item.file, dataset_name, filename=data_item.filename)
+        file_path = save_data_to_file(data_item.file, filename=data_item.filename)

     elif isinstance(data_item, str):
         # data is a file path
@@ -15,7 +15,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str
             file_path = data_item.replace("file://", "")
         # data is text
         else:
-            file_path = save_data_to_file(data_item, dataset_name)
+            file_path = save_data_to_file(data_item)
     else:
         raise IngestionError(message=f"Data type not supported: {type(data_item)}")
```
```diff
@@ -17,7 +17,7 @@ async def save_data_item_with_metadata_to_storage(
     # data is a file object coming from upload.
     elif hasattr(data_item, "file"):
         file_path = save_data_to_file(
-            data_item.file, dataset_name, filename=data_item.filename
+            data_item.file, filename=data_item.filename
         )

     elif isinstance(data_item, str):
@@ -26,7 +26,7 @@ async def save_data_item_with_metadata_to_storage(
             file_path = data_item.replace("file://", "")
         # data is text
         else:
-            file_path = save_data_to_file(data_item, dataset_name)
+            file_path = save_data_to_file(data_item)
     else:
         raise IngestionError(message=f"Data type not supported: {type(data_item)}")
```
```diff
@@ -8,11 +8,11 @@ def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name
     if type(data_point) == Document:
         file_path = data_point.metadata.get("file_path")
         if file_path is None:
-            file_path = save_data_to_file(data_point.text, dataset_name)
+            file_path = save_data_to_file(data_point.text)
             return file_path
         return file_path
     elif type(data_point) == ImageDocument:
         if data_point.image_path is None:
-            file_path = save_data_to_file(data_point.text, dataset_name)
+            file_path = save_data_to_file(data_point.text)
             return file_path
         return data_point.image_path
```
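Since `dataset_name` was only ever forwarded to `save_data_to_file`, llama-index callers keep the same entry point. A hedged usage sketch (the `Document` import path differs across llama-index versions, the `get_data_from_llama_index` import from cognee is omitted, and `"my_dataset"` is a placeholder):

```python
from llama_index.core import Document  # older releases: from llama_index import Document

doc = Document(text="Some text to ingest.")

# With no "file_path" in doc.metadata, get_data_from_llama_index falls back
# to save_data_to_file(doc.text) and returns a hash-named file:// path.
file_path = get_data_from_llama_index(doc, "my_dataset")
```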