# cognee/cognitive_architecture/api/v1/add/add.py
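"""Ingest files into cognee.

Accepts either a directory path (each sub-directory becomes a dataset) or an
explicit list of file paths, and loads the extracted file metadata into a
local DuckDB database through a dlt pipeline.
"""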
from typing import List, Union
from os import path, listdir
import asyncio
import dlt
import duckdb
from unstructured.cleaners.core import clean
from cognitive_architecture.root_dir import get_absolute_path
import cognitive_architecture.modules.ingestion as ingestion
from cognitive_architecture.infrastructure.files import get_file_metadata
from cognitive_architecture.infrastructure.files.storage import LocalStorage

async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
    if isinstance(file_paths, str):
        # Directory path provided; extract the file paths and dataset names from it
        def list_dir_files(root_dir_path: str, parent_dir: str = "root"):
            datasets = {}

            for file_or_dir in listdir(root_dir_path):
                if path.isdir(path.join(root_dir_path, file_or_dir)):
                    # Nested directories become dot-separated dataset names
                    dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
                    dataset_name = clean(dataset_name.replace(" ", "_"))

                    nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

                    for dataset in nested_datasets:
                        datasets[dataset] = nested_datasets[dataset]
                else:
                    if parent_dir not in datasets:
                        datasets[parent_dir] = []

                    datasets[parent_dir].append(path.join(root_dir_path, file_or_dir))

            return datasets
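        # Example (hypothetical tree): given
        #   /docs/notes.txt
        #   /docs/reports/q1.pdf
        # list_dir_files("/docs") returns
        #   {"root": ["/docs/notes.txt"], "reports": ["/docs/reports/q1.pdf"]}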
        datasets = list_dir_files(file_paths)

        results = []

        for key in datasets:
            # Only ingest datasets under the requested dataset_name prefix
            if dataset_name is not None and not key.startswith(dataset_name):
                continue

            results.append(add(datasets[key], dataset_name = key))

        return await asyncio.gather(*results)

    db_path = get_absolute_path("./data/cognee")
    db_location = f"{db_path}/cognee.duckdb"

    LocalStorage.ensure_directory_exists(db_path)

    db = duckdb.connect(db_location)

    destination = dlt.destinations.duckdb(
        credentials = db,
    )
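    # Handing dlt the open DuckDB connection as credentials makes the
    # pipeline write into the database file created above.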
    pipeline = dlt.pipeline(
        pipeline_name = "file_load_from_filesystem",
        destination = destination,
    )

    @dlt.resource(standalone = True, merge_key = "id")
    def data_resources(file_paths: List[str]):
        for file_path in file_paths:
            with open(file_path.replace("file://", ""), mode = "rb") as file:
                classified_data = ingestion.classify(file)
                data_id = ingestion.identify(classified_data)
                file_metadata = get_file_metadata(classified_data.get_data())

                yield {
                    "id": data_id,
                    "name": file_metadata["name"],
                    "file_path": file_metadata["file_path"],
                    "extension": file_metadata["extension"],
                    "mime_type": file_metadata["mime_type"],
                    "keywords": "|".join(file_metadata["keywords"]),
                }
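    # write_disposition = "merge" with merge_key = "id" upserts on the content
    # id, so re-adding the same file updates its row instead of duplicating it.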

    run_info = pipeline.run(
        data_resources(file_paths),
        table_name = "file_metadata",
        dataset_name = dataset_name,
        write_disposition = "merge",
    )

    return run_info
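
# Usage sketch (hypothetical paths; assumes this module is importable as
# cognitive_architecture.api.v1.add.add):
#
#   import asyncio
#   from cognitive_architecture.api.v1.add.add import add
#
#   # Directory mode: every sub-directory becomes its own dataset.
#   asyncio.run(add("/data/documents"))
#
#   # Explicit list mode: load the given files into one named dataset.
#   asyncio.run(add(["file:///data/documents/report.pdf"], dataset_name = "reports"))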