53 lines
1.5 KiB
Python
53 lines
1.5 KiB
Python
import asyncio
|
|
from uuid import UUID, uuid4
|
|
from typing import Union, BinaryIO, List
|
|
import cognitive_architecture.modules.ingestion as ingestion
|
|
from cognitive_architecture.infrastructure import infrastructure_config
|
|
|
|
class DatasetException(Exception):
    """Error raised when data cannot be added to a dataset.

    Attributes:
        message: Human-readable description of the failure.
    """

    message: str

    def __init__(self, message: str):
        # Chain to Exception so ``args`` and ``str(exc)`` carry the message
        # regardless of whether it was passed positionally or by keyword.
        super().__init__(message)
        self.message = message
|
|
|
|
|
|
async def add_standalone(
    data: Union[str, BinaryIO, List[Union[str, BinaryIO]]],
    dataset_id: Union[UUID, None] = None,
    dataset_name: Union[str, None] = None,
):
    """Ingest one item, or a list of items, into a dataset.

    Args:
        data: A string, an open binary file, or a list of either. A string
            starting with ``file://`` is treated as a path: the file is
            opened in binary mode and ingested in its place.
        dataset_id: Identifier of the target dataset. When omitted, a fresh
            UUID is generated for this call (and shared by every item of a
            list passed in the same call).
        dataset_name: Optional dataset name, forwarded to ``ingestion.save``.

    Returns:
        The dataset id for a single item, or a list with one result per
        element when ``data`` is a list.

    Raises:
        DatasetException: If ``data`` is empty or otherwise falsy.
    """
    # A ``uuid4()`` default in the signature would be evaluated only once,
    # at definition time, making every caller share the same id.
    if dataset_id is None:
        dataset_id = uuid4()

    db_engine = infrastructure_config.get_config()["database_engine"]
    if db_engine.is_db_done is not True:
        await db_engine.ensure_tables()

    if not data:
        raise DatasetException("Data must be provided to cognee.add(data: str)")

    if isinstance(data, list):
        # Fan out over the items concurrently; all of them land in the same
        # dataset because the resolved id is passed down explicitly.
        return await asyncio.gather(
            *(add_standalone(item, dataset_id, dataset_name) for item in data)
        )

    if is_data_path(data):
        # Strip only the leading scheme; a later "file://" inside the path
        # must be left untouched.
        with open(data[len("file://"):], "rb") as file:
            return await add_standalone(file, dataset_id, dataset_name)

    classified_data = ingestion.classify(data)
    data_id = ingestion.identify(classified_data)
    await ingestion.save(dataset_id, dataset_name, data_id, classified_data)

    # NOTE: the vectorization step is currently disabled:
    # await ingestion.vectorize(dataset_id, dataset_name, data_id, classified_data)

    return dataset_id
|
|
|
|
|
|
def is_data_path(data: str) -> bool:
    """Return True when *data* is a string that begins with ``file://``."""
    if isinstance(data, str):
        return data.startswith("file://")
    return False
|