import asyncio
from os import path, listdir
from typing import List, Union

import dlt
import duckdb
from unstructured.cleaners.core import clean

import cognitive_architecture.modules.ingestion as ingestion
from cognitive_architecture.infrastructure.files import get_file_metadata
from cognitive_architecture.infrastructure.files.storage import LocalStorage
from cognitive_architecture.root_dir import get_absolute_path


async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
    """Ingest files into the local DuckDB metadata store.

    Accepts either a single directory path or a list of file paths; a directory is
    walked recursively and each subdirectory is ingested as its own dataset.
    """
    if isinstance(file_paths, str):
        # A directory path was provided; collect its file paths and derive dataset names.
        def list_dir_files(root_dir_path: str, parent_dir: str = "root"):
            datasets = {}

            for file_or_dir in listdir(root_dir_path):
                if path.isdir(path.join(root_dir_path, file_or_dir)):
                    # Nested directories become dot-separated dataset names, e.g. "docs.api".
                    dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
                    dataset_name = clean(dataset_name.replace(" ", "_"))

                    nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

                    for dataset in nested_datasets:
                        datasets[dataset] = nested_datasets[dataset]
                else:
                    # Files directly under `parent_dir` are grouped into that dataset.
                    if parent_dir not in datasets:
                        datasets[parent_dir] = []

                    datasets[parent_dir].append(path.join(root_dir_path, file_or_dir))

            return datasets
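
        # Illustration only (hypothetical layout, not from the original code): for a tree like
        #     /data/docs/intro.txt
        #     /data/docs/api/usage.txt
        # list_dir_files("/data") returns
        #     {"docs": ["/data/docs/intro.txt"], "docs.api": ["/data/docs/api/usage.txt"]}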

        datasets = list_dir_files(file_paths)

        results = []

        for key in datasets:
            # If a dataset name was passed in, only ingest datasets whose names start with it.
            if dataset_name is not None and not key.startswith(dataset_name):
                continue

            # Recurse with the concrete file list for this dataset; runs are gathered concurrently.
            results.append(add(datasets[key], dataset_name = key))

        return await asyncio.gather(*results)

    db_path = get_absolute_path("./data/cognee")
    db_location = f"{db_path}/cognee.duckdb"

    LocalStorage.ensure_directory_exists(db_path)

    # Reuse an open DuckDB connection as the dlt destination credentials.
    db = duckdb.connect(db_location)

    destination = dlt.destinations.duckdb(
        credentials = db,
    )

    pipeline = dlt.pipeline(
        pipeline_name = "file_load_from_filesystem",
        destination = destination,
    )

    @dlt.resource(standalone = True, merge_key = "id")
    def data_resources(file_paths: List[str]):
        # Yield one metadata row per file; `id` is the merge key, so re-ingesting
        # the same file updates its existing row instead of duplicating it.
        for file_path in file_paths:
            with open(file_path.replace("file://", ""), mode = "rb") as file:
                classified_data = ingestion.classify(file)

                data_id = ingestion.identify(classified_data)

                file_metadata = get_file_metadata(classified_data.get_data())

                yield {
                    "id": data_id,
                    "name": file_metadata["name"],
                    "file_path": file_metadata["file_path"],
                    "extension": file_metadata["extension"],
                    "mime_type": file_metadata["mime_type"],
                    "keywords": "|".join(file_metadata["keywords"]),
                }
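
    # Shape of a yielded row, for illustration (values are hypothetical):
    # {"id": "...", "name": "report", "file_path": "/data/report.pdf",
    #  "extension": "pdf", "mime_type": "application/pdf", "keywords": "finance|q3"}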

    run_info = pipeline.run(
        data_resources(file_paths),
        table_name = "file_metadata",
        dataset_name = dataset_name,
        write_disposition = "merge",
    )

    return run_info
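

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module; the file path and dataset
    # name below are hypothetical. Passing a list of file paths ingests them into the
    # given dataset; passing a directory path instead walks it and creates one dataset
    # per subdirectory.
    load_info = asyncio.run(add(["./data/example.txt"], dataset_name = "example"))
    print(load_info)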