import asyncio
from os import path, listdir
from typing import List, Union

import dlt
import duckdb
from unstructured.cleaners.core import clean

import cognitive_architecture.modules.ingestion as ingestion
from cognitive_architecture.infrastructure.files import get_file_metadata
from cognitive_architecture.infrastructure.files.storage import LocalStorage
from cognitive_architecture.root_dir import get_absolute_path


async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
    """Ingest files into the local DuckDB metadata store.

    Accepts either a single directory path or a list of file paths; a directory is
    walked recursively and each subdirectory is ingested as its own dataset.
    """
    if isinstance(file_paths, str):
        # A directory path was provided; collect its file paths and derive dataset names.
        def list_dir_files(root_dir_path: str, parent_dir: str = "root"):
            datasets = {}

            for file_or_dir in listdir(root_dir_path):
                if path.isdir(path.join(root_dir_path, file_or_dir)):
                    # Nested directories become dot-separated dataset names, e.g. "docs.api".
                    dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
                    dataset_name = clean(dataset_name.replace(" ", "_"))

                    nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

                    for dataset in nested_datasets:
                        datasets[dataset] = nested_datasets[dataset]
                else:
                    # Files directly under `parent_dir` are grouped into that dataset.
                    if parent_dir not in datasets:
                        datasets[parent_dir] = []

                    datasets[parent_dir].append(path.join(root_dir_path, file_or_dir))

            return datasets
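
        # Illustration only (hypothetical layout, not from the original code): for a tree like
        #     /data/docs/intro.txt
        #     /data/docs/api/usage.txt
        # list_dir_files("/data") returns
        #     {"docs": ["/data/docs/intro.txt"], "docs.api": ["/data/docs/api/usage.txt"]}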

        datasets = list_dir_files(file_paths)

        results = []

        for key in datasets:
            # If a dataset name was passed in, only ingest datasets whose names start with it.
            if dataset_name is not None and not key.startswith(dataset_name):
                continue

            # Recurse with the concrete file list for this dataset; runs are gathered concurrently.
            results.append(add(datasets[key], dataset_name = key))

        return await asyncio.gather(*results)

    db_path = get_absolute_path("./data/cognee")
    db_location = f"{db_path}/cognee.duckdb"

    LocalStorage.ensure_directory_exists(db_path)

    # Reuse an open DuckDB connection as the dlt destination credentials.
    db = duckdb.connect(db_location)

    destination = dlt.destinations.duckdb(
        credentials = db,
    )

    pipeline = dlt.pipeline(
        pipeline_name = "file_load_from_filesystem",
        destination = destination,
    )

    @dlt.resource(standalone = True, merge_key = "id")
    def data_resources(file_paths: List[str]):
        # Yield one metadata row per file; `id` is the merge key, so re-ingesting
        # the same file updates its existing row instead of duplicating it.
        for file_path in file_paths:
            with open(file_path.replace("file://", ""), mode = "rb") as file:
                classified_data = ingestion.classify(file)

                data_id = ingestion.identify(classified_data)

                file_metadata = get_file_metadata(classified_data.get_data())

                yield {
                    "id": data_id,
                    "name": file_metadata["name"],
                    "file_path": file_metadata["file_path"],
                    "extension": file_metadata["extension"],
                    "mime_type": file_metadata["mime_type"],
                    "keywords": "|".join(file_metadata["keywords"]),
                }
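
    # Shape of a yielded row, for illustration (values are hypothetical):
    # {"id": "...", "name": "report", "file_path": "/data/report.pdf",
    #  "extension": "pdf", "mime_type": "application/pdf", "keywords": "finance|q3"}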

    run_info = pipeline.run(
        data_resources(file_paths),
        table_name = "file_metadata",
        dataset_name = dataset_name,
        write_disposition = "merge",
    )

    return run_info
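

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module; the file path and dataset
    # name below are hypothetical. Passing a list of file paths ingests them into the
    # given dataset; passing a directory path instead walks it and creates one dataset
    # per subdirectory.
    load_info = asyncio.run(add(["./data/example.txt"], dataset_name = "example"))
    print(load_info)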