feat: Add text deduplication
If text is added to cognee, it is saved under a hash of its contents, so the same text cannot be stored multiple times. Feature COG-505.
This commit is contained in:
parent
6bb0f3d8f2
commit
0ce254b262
6 changed files with 21 additions and 20 deletions
|
|
@ -33,11 +33,11 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_nam
|
||||||
|
|
||||||
# data is text
|
# data is text
|
||||||
else:
|
else:
|
||||||
file_path = save_data_to_file(data, dataset_name)
|
file_path = save_data_to_file(data)
|
||||||
return await add([file_path], dataset_name)
|
return await add([file_path], dataset_name)
|
||||||
|
|
||||||
if hasattr(data, "file"):
|
if hasattr(data, "file"):
|
||||||
file_path = save_data_to_file(data.file, dataset_name, filename = data.filename)
|
file_path = save_data_to_file(data.file, filename = data.filename)
|
||||||
return await add([file_path], dataset_name)
|
return await add([file_path], dataset_name)
|
||||||
|
|
||||||
# data is a list of file paths or texts
|
# data is a list of file paths or texts
|
||||||
|
|
@ -45,13 +45,13 @@ async def add(data: Union[BinaryIO, List[BinaryIO], str, List[str]], dataset_nam
|
||||||
|
|
||||||
for data_item in data:
|
for data_item in data:
|
||||||
if hasattr(data_item, "file"):
|
if hasattr(data_item, "file"):
|
||||||
file_paths.append(save_data_to_file(data_item, dataset_name, filename = data_item.filename))
|
file_paths.append(save_data_to_file(data_item, filename = data_item.filename))
|
||||||
elif isinstance(data_item, str) and (
|
elif isinstance(data_item, str) and (
|
||||||
data_item.startswith("/") or data_item.startswith("file://")
|
data_item.startswith("/") or data_item.startswith("file://")
|
||||||
):
|
):
|
||||||
file_paths.append(data_item)
|
file_paths.append(data_item)
|
||||||
elif isinstance(data_item, str):
|
elif isinstance(data_item, str):
|
||||||
file_paths.append(save_data_to_file(data_item, dataset_name))
|
file_paths.append(save_data_to_file(data_item))
|
||||||
|
|
||||||
if len(file_paths) > 0:
|
if len(file_paths) > 0:
|
||||||
return await add_files(file_paths, dataset_name, user)
|
return await add_files(file_paths, dataset_name, user)
|
||||||
|
|
|
||||||
|
|
@ -1,25 +1,28 @@
|
||||||
import string
|
import os.path
|
||||||
import random
|
import hashlib
|
||||||
from typing import BinaryIO, Union
|
from typing import BinaryIO, Union
|
||||||
from cognee.base_config import get_base_config
|
from cognee.base_config import get_base_config
|
||||||
from cognee.infrastructure.files.storage import LocalStorage
|
from cognee.infrastructure.files.storage import LocalStorage
|
||||||
from .classify import classify
|
from .classify import classify
|
||||||
|
|
||||||
def save_data_to_file(data: Union[str, BinaryIO], dataset_name: str, filename: str = None):
|
def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
|
||||||
base_config = get_base_config()
|
base_config = get_base_config()
|
||||||
data_directory_path = base_config.data_root_directory
|
data_directory_path = base_config.data_root_directory
|
||||||
|
|
||||||
classified_data = classify(data, filename)
|
classified_data = classify(data, filename)
|
||||||
|
|
||||||
storage_path = data_directory_path + "/" + dataset_name.replace(".", "/")
|
storage_path = os.path.join(data_directory_path, "data")
|
||||||
LocalStorage.ensure_directory_exists(storage_path)
|
LocalStorage.ensure_directory_exists(storage_path)
|
||||||
|
|
||||||
file_metadata = classified_data.get_metadata()
|
file_metadata = classified_data.get_metadata()
|
||||||
if "name" not in file_metadata or file_metadata["name"] is None:
|
if "name" not in file_metadata or file_metadata["name"] is None:
|
||||||
letters = string.ascii_lowercase
|
data_contents = classified_data.get_data().encode('utf-8')
|
||||||
random_string = "".join(random.choice(letters) for _ in range(32))
|
hash_contents = hashlib.md5(data_contents).hexdigest()
|
||||||
file_metadata["name"] = "text_" + random_string + ".txt"
|
file_metadata["name"] = "text_" + hash_contents + ".txt"
|
||||||
file_name = file_metadata["name"]
|
file_name = file_metadata["name"]
|
||||||
LocalStorage(storage_path).store(file_name, classified_data.get_data())
|
|
||||||
|
# Don't save file if it already exists
|
||||||
|
if not os.path.isfile(os.path.join(storage_path, file_name)):
|
||||||
|
LocalStorage(storage_path).store(file_name, classified_data.get_data())
|
||||||
|
|
||||||
return "file://" + storage_path + "/" + file_name
|
return "file://" + storage_path + "/" + file_name
|
||||||
|
|
|
||||||
|
|
@ -90,8 +90,6 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
|
||||||
mime_type = file_metadata["mime_type"]
|
mime_type = file_metadata["mime_type"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Check if data is already in dataset
|
# Check if data is already in dataset
|
||||||
dataset_data = (
|
dataset_data = (
|
||||||
await session.execute(select(DatasetData).filter(DatasetData.data_id == data_id,
|
await session.execute(select(DatasetData).filter(DatasetData.data_id == data_id,
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str
|
||||||
|
|
||||||
# data is a file object coming from upload.
|
# data is a file object coming from upload.
|
||||||
if hasattr(data_item, "file"):
|
if hasattr(data_item, "file"):
|
||||||
file_path = save_data_to_file(data_item.file, dataset_name, filename=data_item.filename)
|
file_path = save_data_to_file(data_item.file, filename=data_item.filename)
|
||||||
|
|
||||||
elif isinstance(data_item, str):
|
elif isinstance(data_item, str):
|
||||||
# data is a file path
|
# data is a file path
|
||||||
|
|
@ -15,7 +15,7 @@ def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str
|
||||||
file_path = data_item.replace("file://", "")
|
file_path = data_item.replace("file://", "")
|
||||||
# data is text
|
# data is text
|
||||||
else:
|
else:
|
||||||
file_path = save_data_to_file(data_item, dataset_name)
|
file_path = save_data_to_file(data_item)
|
||||||
else:
|
else:
|
||||||
raise IngestionError(message=f"Data type not supported: {type(data_item)}")
|
raise IngestionError(message=f"Data type not supported: {type(data_item)}")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ async def save_data_item_with_metadata_to_storage(
|
||||||
# data is a file object coming from upload.
|
# data is a file object coming from upload.
|
||||||
elif hasattr(data_item, "file"):
|
elif hasattr(data_item, "file"):
|
||||||
file_path = save_data_to_file(
|
file_path = save_data_to_file(
|
||||||
data_item.file, dataset_name, filename=data_item.filename
|
data_item.file, filename=data_item.filename
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(data_item, str):
|
elif isinstance(data_item, str):
|
||||||
|
|
@ -26,7 +26,7 @@ async def save_data_item_with_metadata_to_storage(
|
||||||
file_path = data_item.replace("file://", "")
|
file_path = data_item.replace("file://", "")
|
||||||
# data is text
|
# data is text
|
||||||
else:
|
else:
|
||||||
file_path = save_data_to_file(data_item, dataset_name)
|
file_path = save_data_to_file(data_item)
|
||||||
else:
|
else:
|
||||||
raise IngestionError(message=f"Data type not supported: {type(data_item)}")
|
raise IngestionError(message=f"Data type not supported: {type(data_item)}")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,11 @@ def get_data_from_llama_index(data_point: Union[Document, ImageDocument], datase
|
||||||
if type(data_point) == Document:
|
if type(data_point) == Document:
|
||||||
file_path = data_point.metadata.get("file_path")
|
file_path = data_point.metadata.get("file_path")
|
||||||
if file_path is None:
|
if file_path is None:
|
||||||
file_path = save_data_to_file(data_point.text, dataset_name)
|
file_path = save_data_to_file(data_point.text)
|
||||||
return file_path
|
return file_path
|
||||||
return file_path
|
return file_path
|
||||||
elif type(data_point) == ImageDocument:
|
elif type(data_point) == ImageDocument:
|
||||||
if data_point.image_path is None:
|
if data_point.image_path is None:
|
||||||
file_path = save_data_to_file(data_point.text, dataset_name)
|
file_path = save_data_to_file(data_point.text)
|
||||||
return file_path
|
return file_path
|
||||||
return data_point.image_path
|
return data_point.image_path
|
||||||
Loading…
Add table
Reference in a new issue