feat: save files metadata in duckdb

This commit is contained in:
Boris Arzentar 2024-03-12 13:42:51 +01:00
parent 279c7a0789
commit 8cbf488e59
75 changed files with 5096 additions and 180 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

6
.dlt/config.toml Normal file
View file

@ -0,0 +1,6 @@
# put your configuration values here
[runtime]
log_level="WARNING" # the system log level of dlt
# use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry
dlthub_telemetry = false

5
.dlt/secrets.toml Normal file
View file

@ -0,0 +1,5 @@
# put your secret values and credentials here. do not share this file and do not push it to github
[destination.qdrant.credentials]
location = "https://cff4594b-c2de-4fcf-8365-4c1d3a1c1429.us-east4-0.gcp.cloud.qdrant.io:6333"
api_key = "K5BKjVGR8Qn4pVMk9nPFNTqITu3QVGR1O8qlDDH6kk52HUwB4lRjjw"

1
.python-version Normal file
View file

@ -0,0 +1 @@
3.10.13

156
cognee.add.ipynb Normal file
View file

@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "823c799a",
"metadata": {},
"outputs": [],
"source": [
"from os import listdir, path\n",
"from uuid import uuid5, UUID\n",
"from cognitive_architecture import add\n",
"\n",
"data_path = path.abspath(\".data\")\n",
"pdf_files = [file for file in listdir(data_path) if path.isfile(path.join(data_path, file))]\n",
"\n",
"await add(\n",
" list(map(\n",
" lambda file_path: f\"file://{path.join(data_path, file_path)}\",\n",
" pdf_files\n",
" ))[:3],\n",
" uuid5(UUID(\"00000000-0000-0000-0000-000000000000\"), \"pdf_files_cognee\"),\n",
" \"test-dataset\"\n",
")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c4d5a399",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline file_load_from_filesystem load step completed in 0.39 seconds\n",
"1 load package(s) were loaded to destination duckdb and into dataset pravilnik_energetska_efikasnost\n",
"The duckdb destination used duckdb:///:external: location to store data\n",
"Load package 1710243999.1369941 is LOADED and contains no failed jobs\n",
"Pipeline file_load_from_filesystem load step completed in 0.40 seconds\n",
"1 load package(s) were loaded to destination duckdb and into dataset pravilnik_energetska_efikasnost_sertifikati\n",
"The duckdb destination used duckdb:///:external: location to store data\n",
"Load package 1710246971.055336 is LOADED and contains no failed jobs\n"
]
}
],
"source": [
"from os import listdir, path\n",
"from uuid import uuid5, UUID\n",
"from cognitive_architecture import add_dlt\n",
"\n",
"data_path = path.abspath(\".data\")\n",
"# pdf_files = [file for file in listdir(data_path) if path.isfile(path.join(data_path, file))]\n",
"\n",
"# await add_dlt(\n",
"# list(map(\n",
"# lambda file_path: f\"file://{path.join(data_path, file_path)}\",\n",
"# pdf_files\n",
"# ))[:5],\n",
"# \"pdf_files\"\n",
"# )\n",
"\n",
"results = await add_dlt(data_path, \"pravilnik.energetska_efikasnost\")\n",
"for result in results:\n",
" print(result)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "47edce91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/borisarzentar/Projects/Topoteretes/cognee/cognitive_architecture/data/cognee/cognee.duckdb\n",
"┌──────────────────────┬──────────────────────┬──────────────────────┬───┬───────────────────┬────────────────┐\n",
"│ id │ name │ file_path │ … │ _dlt_load_id │ _dlt_id │\n",
"│ varchar │ varchar │ varchar │ │ varchar │ varchar │\n",
"├──────────────────────┼──────────────────────┼──────────────────────┼───┼───────────────────┼────────────────┤\n",
"│ 881ecb36-2819-54c3… │ Izmenjen-clan-17-P… │ /Users/borisarzent… │ … │ 1710242259.670676 │ /LPIFEK4ayoivQ │\n",
"├──────────────────────┴──────────────────────┴──────────────────────┴───┴───────────────────┴────────────────┤\n",
"│ 1 rows 8 columns (5 shown) │\n",
"└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n",
"\n",
"┌──────────────────────┬──────────────────────┬──────────────────────┬───┬────────────────────┬────────────────┐\n",
"│ id │ name │ file_path │ … │ _dlt_load_id │ _dlt_id │\n",
"│ varchar │ varchar │ varchar │ │ varchar │ varchar │\n",
"├──────────────────────┼──────────────────────┼──────────────────────┼───┼────────────────────┼────────────────┤\n",
"│ cd1dc11f-397b-5048… │ Pravilnik-o-sadrzi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ TrxbqUr6PepbTA │\n",
"│ 320cee87-d02e-540d… │ Pravilnik-o-nacinu… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ njwnSKr24K1vEQ │\n",
"│ bfe85a36-1427-555d… │ Pravilnik-o-izgled… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ hYgSJPKZJoQNHQ │\n",
"│ 5767b799-9815-5834… │ Pravilnik-o-postup… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ CAR2xYK53eR9Wg │\n",
"│ 9133b38e-d2aa-5916… │ Pravilnik-o-uslovi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ aiDe0Ggk5JMN/w │\n",
"│ bff79816-8610-5dfa… │ Pravilnik-o-sadrzi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ Fi2fQVYOI1lZ8w │\n",
"│ d3fbcf40-abcc-56d4… │ Pravilnik-o-uslovi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ aw1WBpNI62y+Kg │\n",
"│ 826bbd41-e322-5587… │ Pravilnik-o-katast… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ S5QOxjEv51lBBw │\n",
"│ f354abe5-bc7e-520f… │ Pravilnik-o-objekt… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ B5CinyB0UGlbng │\n",
"│ 1e47801b-2a4f-57cf… │ Pravilnik-o-sadrzi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ w6CcdYAB8ie+xw │\n",
"├──────────────────────┴──────────────────────┴──────────────────────┴───┴────────────────────┴────────────────┤\n",
"│ 10 rows 8 columns (5 shown) │\n",
"└──────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n",
"\n"
]
}
],
"source": [
"import duckdb\n",
"from cognitive_architecture.root_dir import get_absolute_path\n",
"\n",
"dataset_name = \"pdf_files\"\n",
"\n",
"db_path = get_absolute_path(\"./data/cognee\")\n",
"db_location = db_path + \"/cognee.duckdb\"\n",
"print(db_location)\n",
"\n",
"db = duckdb.connect(db_location)\n",
"\n",
"izmene = db.sql(f\"SELECT * FROM izmene.file_metadata;\")\n",
"\n",
"print(izmene)\n",
"\n",
"pravilnik = db.sql(f\"SELECT * FROM pravilnik.file_metadata;\")\n",
"\n",
"print(pravilnik)\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

BIN
cognitive_architecture/.DS_Store vendored Normal file

Binary file not shown.

View file

@ -0,0 +1,2 @@
from .api.v1.add.add import add
from .api.v1.add.add_dlt import add_dlt

View file

@ -0,0 +1 @@
from .add import add

View file

@ -0,0 +1,53 @@
import asyncio
from uuid import UUID, uuid4
from typing import Union, BinaryIO, List
import cognitive_architecture.modules.ingestion as ingestion
from cognitive_architecture.infrastructure import infrastructure_config
class DatasetException(Exception):
    """Raised when cognee.add receives missing or invalid data."""

    message: str

    def __init__(self, message: str):
        self.message = message
async def add(
    data: Union[str, BinaryIO, List[Union[str, BinaryIO]]],
    dataset_id: UUID = None,
    dataset_name: str = None
):
    """Ingest *data* into the dataset identified by *dataset_id*.

    Parameters:
        data: a raw text string, an open binary file, a "file://" path,
            or a list mixing any of those.
        dataset_id: target dataset UUID. When omitted, a fresh UUID is
            generated per call. (The previous `= uuid4()` default was
            evaluated once at import time, so every defaulted call silently
            shared the same dataset id.)
        dataset_name: optional human-readable dataset name.

    Returns the dataset UUID the data was saved under, or a list of results
    when *data* is a list.

    Raises DatasetException when *data* is empty.
    """
    if dataset_id is None:
        dataset_id = uuid4()

    db_engine = infrastructure_config.get_config()["database_engine"]

    # Lazily create the relational tables on first use.
    if db_engine.is_db_done is not True:
        await db_engine.ensure_tables()

    if not data:
        raise DatasetException("Data must be provided to cognee.add(data: str)")

    # A list fans out into concurrent add() calls sharing one dataset id.
    if isinstance(data, list):
        results = await asyncio.gather(*[
            add(data_item, dataset_id, dataset_name) for data_item in data
        ])
        return results

    # "file://..." strings are opened and re-dispatched as binary data.
    if is_data_path(data):
        with open(data.replace("file://", ""), "rb") as file:
            return await add(file, dataset_id, dataset_name)

    classified_data = ingestion.classify(data)
    data_id = ingestion.identify(classified_data)

    await ingestion.save(dataset_id, dataset_name, data_id, classified_data)

    return dataset_id
    # await ingestion.vectorize(dataset_id, dataset_name, data_id, classified_data)
def is_data_path(data: str) -> bool:
    """Return True when *data* is a "file://" path string."""
    return isinstance(data, str) and data.startswith("file://")

View file

@ -0,0 +1,91 @@
from typing import List, Union
from os import path, listdir
import asyncio
import dlt
import duckdb
from unstructured.cleaners.core import clean
from cognitive_architecture.root_dir import get_absolute_path
import cognitive_architecture.modules.ingestion as ingestion
from cognitive_architecture.infrastructure.files import get_file_metadata
from cognitive_architecture.infrastructure.files.storage import LocalStorage
async def add_dlt(file_paths: Union[str, List[str]], dataset_name: str = None):
    """Load file metadata into DuckDB through a dlt pipeline.

    *file_paths* is either a directory path (walked recursively — each
    sub-directory becomes a dot-separated dataset name) or an explicit list
    of file paths loaded under *dataset_name*.

    Returns the dlt run info (or a list of run infos for a directory walk).
    """
    if isinstance(file_paths, str):
        # Directory path provided, we need to extract the file paths and dataset name
        def list_dir_files(root_dir_path: str, parent_dir: str = "root"):
            datasets = {}

            for file_or_dir in listdir(root_dir_path):
                if path.isdir(path.join(root_dir_path, file_or_dir)):
                    # Build a dot-separated dataset name from the directory chain.
                    dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
                    dataset_name = clean(dataset_name.replace(" ", "_"))

                    nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

                    for dataset in nested_datasets:
                        datasets[dataset] = nested_datasets[dataset]
                else:
                    if parent_dir not in datasets:
                        datasets[parent_dir] = []

                    datasets[parent_dir].append(path.join(root_dir_path, file_or_dir))

            return datasets

        datasets = list_dir_files(file_paths)

        results = []

        for key in datasets:
            # When a dataset-name filter is given, only load matching datasets.
            if dataset_name is not None and not key.startswith(dataset_name):
                continue

            results.append(add_dlt(datasets[key], dataset_name = key))

        return await asyncio.gather(*results)

    db_path = get_absolute_path("./data/cognee")
    db_location = f"{db_path}/cognee.duckdb"

    LocalStorage.ensure_directory_exists(db_path)
    db = duckdb.connect(db_location)

    destination = dlt.destinations.duckdb(
        credentials = db,
    )

    pipeline = dlt.pipeline(
        pipeline_name = "file_load_from_filesystem",
        destination = destination,
    )

    @dlt.resource(standalone = True, merge_key = "id")
    def data_resources(file_paths: List[str]):
        # One metadata row per file; merge_key="id" lets re-runs merge rows
        # instead of duplicating them.
        for file_path in file_paths:
            with open(file_path.replace("file://", ""), mode = "rb") as file:
                classified_data = ingestion.classify(file)
                data_id = ingestion.identify(classified_data)

                file_metadata = get_file_metadata(classified_data.get_data())

                yield {
                    "id": data_id,
                    "name": file_metadata["name"],
                    "file_path": file_metadata["file_path"],
                    "extension": file_metadata["extension"],
                    "mime_type": file_metadata["mime_type"],
                    "keywords": "|".join(file_metadata["keywords"]),
                }

    run_info = pipeline.run(
        data_resources(file_paths),
        table_name = "file_metadata",
        dataset_name = dataset_name,
        write_disposition = "merge",
    )

    return run_info

View file

@ -1,32 +0,0 @@
from enum import Enum
from qdrant_client.models import Distance, VectorParams
from cognitive_architecture.modules.memory.vector import create_vector_memory
from cognitive_architecture.modules.users.memory import is_existing_memory, register_memory_for_user
from cognitive_architecture.infrastructure.databases.vector.qdrant.adapter import CollectionConfig
class MemoryType(Enum):
GRAPH = "GRAPH"
VECTOR = "VECTOR"
RELATIONAL = "RELATIONAL"
class MemoryException(Exception):
message: str
def __init__(self, message: str):
self.message = message
async def create_memory(user_id: str, memory_name: str, memory_type: MemoryType):
if await is_existing_memory(memory_name):
raise MemoryException(f'Memory with the name "{memory_name}" already exists. Memory names must be unique.')
match memory_type:
case MemoryType.VECTOR:
await create_vector_memory(memory_name, CollectionConfig(
vector_config = VectorParams(
size = 1536,
distance = Distance.DOT,
)
))
await register_memory_for_user(user_id, memory_name)

View file

@ -27,7 +27,7 @@ class Config:
)
)
db_path = Path(__file__).resolve().parent / "database/data"
db_path = str(Path(__file__).resolve().parent / "data/system")
vectordb: str = os.getenv("VECTORDB", "weaviate")
qdrant_path: str = os.getenv("QDRANT_PATH")

Binary file not shown.

Binary file not shown.

View file

@ -1,13 +1,11 @@
"""This module provides functionalities for creating and managing databases."""
import os
from contextlib import asynccontextmanager
from sqlalchemy import create_engine, text
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine
from config import Config
from database.relationaldb.database import Base, get_sqlalchemy_database_url
from database.relationaldb.models import memory, metadatas, operation, sessions, user, docs
from cognitive_architecture.config import Config
from cognitive_architecture.database.relationaldb.database import Base, get_sqlalchemy_database_url
globalConfig = Config()
@ -58,10 +56,8 @@ class DatabaseManager:
async def create_tables(self):
"""Create tables based on the SQLAlchemy Base metadata."""
try:
async with self.engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
async with self.engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
except Exception as e:
print(e)
raise e

View file

@ -0,0 +1,21 @@
from cognitive_architecture.config import Config
from .databases.relational import SqliteEngine, DatabaseEngine
config = Config()
config.load()


class InfrastructureConfig():
    """Process-wide holder for infrastructure singletons, lazily initialized."""

    # Shared relational engine; created on the first get_config() call.
    database_engine: DatabaseEngine = None

    def get_config(self) -> dict:
        """Return the infrastructure registry, creating defaults on first use."""
        if self.database_engine is None:
            # Default to a SQLite engine at the configured path.
            self.database_engine = SqliteEngine(config.db_path, config.db_name)

        return {
            "database_engine": self.database_engine
        }

    def set_config(self, new_config: dict):
        """Replace the configured database engine."""
        self.database_engine = new_config["database_engine"]


# Module-level singleton imported across the codebase.
infrastructure_config = InfrastructureConfig()

View file

@ -0,0 +1 @@
from .InfrastructureConfig import infrastructure_config

View file

@ -0,0 +1,4 @@
from .models.Data import Data
from .models.Dataset import Dataset
from .models.DatasetData import DatasetData
from .add_data_to_dataset import add_data_to_dataset

View file

@ -0,0 +1,45 @@
import logging
from . import Dataset, Data
from cognitive_architecture.infrastructure import infrastructure_config
from cognitive_architecture.infrastructure.databases.relational import DatabaseEngine
from cognitive_architecture.infrastructure.files import remove_file_from_storage
logger = logging.getLogger(__name__)
async def add_data_to_dataset(dataset: Dataset, data: Data):
    """Upsert *data* into *dataset*, replacing any superseded raw file.

    Four cases: both records exist (refresh raw location, maybe rename the
    dataset); only the dataset exists (attach the data); only the data exists
    (attach it to the new dataset); neither exists (create both).
    """
    db_engine: DatabaseEngine = infrastructure_config.get_config()["database_engine"]

    existing_dataset = (await db_engine.query_entity(dataset)).scalar()
    existing_data = (await db_engine.query_entity(data)).scalar()

    if existing_dataset:
        if existing_data:
            # The incoming data replaces the stored raw file.
            await remove_old_raw_data(existing_data.raw_data_location)

            def update_raw_data():
                existing_data.raw_data_location = data.raw_data_location

            await db_engine.update(update_raw_data)

            # Allow renaming the dataset when a name is provided.
            if existing_dataset.id == dataset.id and dataset.name is not None:
                def update_name(): existing_dataset.name = dataset.name
                await db_engine.update(update_name)
        else:
            await db_engine.update(lambda: existing_dataset.data.append(data))
    else:
        if existing_data:
            await remove_old_raw_data(existing_data.raw_data_location)
            # NOTE(review): this mutation happens outside the update callback,
            # unlike the branch above — confirm it is still flushed correctly.
            existing_data.raw_data_location = data.raw_data_location
            await db_engine.update(lambda: dataset.data.append(existing_data))
        else:
            await db_engine.update(lambda: dataset.data.append(data))

        await db_engine.create(dataset)
async def remove_old_raw_data(data_location: str):
    """Best-effort deletion of a superseded raw data file; failures are logged,
    never raised, so ingestion continues."""
    try:
        await remove_file_from_storage(data_location)
    except Exception:
        logger.error("Failed to remove old raw data file: %s", data_location)

View file

@ -0,0 +1,23 @@
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, MappedColumn, Mapped
from sqlalchemy import Column, String, DateTime, UUID, Text, JSON
from cognitive_architecture.infrastructure.databases.relational import ModelBase
from .DatasetData import DatasetData
class Data(ModelBase):
    """Relational record describing one ingested piece of raw data."""

    __tablename__ = "data"

    id = Column(UUID, primary_key = True)
    name = Column(String, nullable = True)
    description = Column(Text, nullable = True)
    raw_data_location = Column(String)
    meta_data: Mapped[dict] = MappedColumn(type_ = JSON)  # "metadata" is a reserved word

    # Callable defaults: evaluated per row. The previous
    # `default = datetime.now(timezone.utc)` was evaluated once at import
    # time, stamping every row with the same frozen timestamp.
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime, onupdate = lambda: datetime.now(timezone.utc))

    datasets: Mapped[List["Dataset"]] = relationship(
        secondary = DatasetData.__tablename__,
        back_populates = "data"
    )

View file

@ -0,0 +1,21 @@
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, Text, DateTime, UUID
from cognitive_architecture.infrastructure.databases.relational import ModelBase
from .DatasetData import DatasetData
class Dataset(ModelBase):
    """Relational record for a named collection of data items."""

    __tablename__ = "dataset"

    id = Column(UUID, primary_key = True)
    name = Column(Text)
    description = Column(Text, nullable = True)

    # Callable defaults so each row gets its own timestamp; the previous
    # `default = datetime.now(timezone.utc)` was computed once at import time.
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime, onupdate = lambda: datetime.now(timezone.utc))

    data: Mapped[List["Data"]] = relationship(
        secondary = DatasetData.__tablename__,
        back_populates = "datasets"
    )

View file

@ -0,0 +1,14 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, UUID, ForeignKey
from cognitive_architecture.infrastructure.databases.relational import ModelBase
class DatasetData(ModelBase):
    """Association table linking datasets and data records (many-to-many)."""

    __tablename__ = "dataset_data"

    # Pass the callable `uuid4`, not `uuid4()`: the previous form generated a
    # single UUID at import time and reused it for every inserted row.
    id = Column(UUID, primary_key = True, default = uuid4)
    # Callable default so the timestamp is taken per row, not once at import.
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))

    dataset_id = Column("dataset", UUID, ForeignKey("dataset.id"), primary_key = True)
    data_id = Column("data", UUID, ForeignKey("data.id"), primary_key = True)

View file

@ -0,0 +1,23 @@
from typing import Protocol
class DatabaseEngine(Protocol):
    """Structural interface every relational database engine must satisfy."""

    async def ensure_tables(self):
        """Create the database and its tables if they do not exist yet."""
        pass

    def database_exists(self, db_name: str) -> bool:
        pass

    def create_database(self, db_name: str):
        pass

    def drop_database(self, db_name: str):
        pass

    async def table_exists(self, table_name: str) -> bool:
        pass

    async def create_tables(self):
        pass

    async def create(self, data):
        """Persist a new entity."""
        pass

View file

@ -0,0 +1,3 @@
from sqlalchemy.orm import declarative_base
ModelBase = declarative_base()

View file

@ -0,0 +1,3 @@
from .ModelBase import ModelBase
from .DatabaseEngine import DatabaseEngine
from .sqlite.SqliteEngine import SqliteEngine

View file

@ -1,4 +0,0 @@
from .general.adapter import RelationalDBAdapter
def get_database():
return RelationalDBAdapter()

View file

@ -0,0 +1,82 @@
import os
import asyncio
from typing import Callable
from sqlalchemy.inspection import inspect
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncEngine, AsyncSession, async_scoped_session
from sqlalchemy.future import select
from cognitive_architecture.infrastructure.files.storage.LocalStorage import LocalStorage
from ..DatabaseEngine import DatabaseEngine
from ..ModelBase import ModelBase
from ..utils import with_rollback
class SqliteEngine(DatabaseEngine):
    """DatabaseEngine backed by a SQLite file accessed through aiosqlite."""

    db_path: str = None       # directory holding the database file
    db_name: str = None       # database file name
    engine: AsyncEngine = None
    session_maker: Callable[[], async_scoped_session[AsyncSession]] = None
    is_db_done: bool = False  # set once ensure_tables() has run

    def __init__(self, db_path: str, db_name: str):
        self.db_path = db_path
        self.db_name = db_name
        self.db_location = db_path + "/" + db_name

        self.engine = create_async_engine(
            f"sqlite+aiosqlite:///{self.db_location}",
            pool_recycle = 3600,
            echo = False
        )

        # Fresh scoped session per call, keyed to the current asyncio task.
        self.session_maker = lambda: async_scoped_session(
            async_sessionmaker(
                bind = self.engine,
                class_ = AsyncSession
            ),
            scopefunc = asyncio.current_task
        )

    async def ensure_tables(self):
        """Create the database file and its tables on first use; idempotent."""
        if not self.database_exists(self.db_name):
            self.create_database(self.db_name)
            await self.create_tables()

        self.is_db_done = True
        return True

    def database_exists(self, db_name: str) -> bool:
        """Return True when the SQLite file already exists on disk."""
        return os.path.exists(self.db_path + "/" + db_name)

    def create_database(self, db_name: str):
        """Create an empty SQLite database file (and its parent directory)."""
        LocalStorage.ensure_directory_exists(self.db_path)

        with open(self.db_path + "/" + db_name, mode = "w+", encoding = "utf-8") as file:
            file.write("")

    def drop_database(self, db_name: str):
        # NOTE(review): ignores the db_name argument and always removes
        # self.db_location — confirm that is intended.
        os.remove(self.db_location)

    async def table_exists(self, table_name: str) -> bool:
        # NOTE(review): sqlalchemy.inspect on an AsyncEngine is not supported at
        # runtime; this likely needs a connection + run_sync — verify.
        return inspect(self.engine).has_table(table_name)

    async def create_tables(self):
        """Create all tables declared on the ModelBase metadata."""
        async with self.engine.begin() as connection:
            return await connection.run_sync(ModelBase.metadata.create_all)

    async def create(self, data):
        """Persist a new entity inside a rolled-back-on-error session."""
        async with with_rollback(self.session_maker()) as session:
            session.add(data)

    async def query(self, query_term):
        """Execute an arbitrary SQLAlchemy statement and return its result."""
        async with with_rollback(self.session_maker()) as session:
            return await session.execute(query_term)

    async def query_entity(self, entity):
        """Fetch the stored row matching *entity* by its primary-key id."""
        async with with_rollback(self.session_maker()) as session:
            return await session.execute(
                select(type(entity))
                .where(type(entity).id == entity.id)
            )

    async def update(self, data_update_fn):
        """Run *data_update_fn* inside a session that commits on success."""
        async with with_rollback(self.session_maker()):
            data_update_fn()

View file

@ -0,0 +1 @@
from .with_rollback import with_rollback

View file

@ -0,0 +1,18 @@
import logging
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import async_scoped_session
logger = logging.getLogger(__name__)
@asynccontextmanager
async def with_rollback(session: async_scoped_session):
    """Provide a transactional scope around a series of operations.

    Commits and disposes the scoped session on success; rolls back, logs,
    and re-raises on any exception.
    """
    try:
        # async with session.begin():
        yield session

        await session.commit()
        # Dispose of the task-scoped session registry once committed.
        await session.remove()
    except Exception as exception:
        await session.rollback()
        # NOTE(review): the failure path does not call session.remove() —
        # confirm the scoped registry is not leaked on errors.
        logger.error("Session rolled back due to: %s", str(exception))
        raise exception

View file

@ -0,0 +1,2 @@
from .get_vector_database import get_vector_database
from .qdrant import QDrantAdapter, CollectionConfig

View file

@ -59,3 +59,13 @@ class QDrantAdapter(VectorDBInterface):
collection_name = collection_name,
points = data_points
)
    async def find_related_data_points(self, collection_name: str, query_vector):
        """Search *collection_name* for points similar to *query_vector*.

        Returns only points scoring above 0.8, with payloads included.
        """
        client = self.get_qdrant_client()

        return await client.search(
            collection_name = collection_name,
            query_vector = query_vector,
            with_payload = True,
            score_threshold = 0.8
        )

View file

@ -1 +1 @@
from .adapter import QDrantAdapter
from .QDrantAdapter import QDrantAdapter, CollectionConfig

View file

@ -46,6 +46,13 @@ class VectorDBInterface(Protocol):
data_points: List[any]
): raise NotImplementedError
@abstractmethod
async def find_related_data_points(
self,
collection_name: str,
query_vector: any
): raise NotImplementedError
# @abstractmethod
# async def get_data_point(
# self,

View file

@ -0,0 +1,3 @@
from .add_file_to_storage import add_file_to_storage
from .remove_file_from_storage import remove_file_from_storage
from .utils.get_file_metadata import get_file_metadata, FileMetadata

View file

@ -0,0 +1,9 @@
from typing import BinaryIO
from cognitive_architecture.root_dir import get_absolute_path
from .storage.StorageManager import StorageManager
from .storage.LocalStorage import LocalStorage
async def add_file_to_storage(file_path: str, file: BinaryIO):
    """Write *file* into the local "data/files" storage under *file_path*."""
    local_backend = LocalStorage(get_absolute_path("data/files"))
    StorageManager(local_backend).store(file_path, file)

View file

@ -0,0 +1,8 @@
from cognitive_architecture.root_dir import get_absolute_path
from .storage.StorageManager import StorageManager
from .storage.LocalStorage import LocalStorage
async def remove_file_from_storage(file_path: str):
    """Delete *file_path* from the local "data/files" storage."""
    local_backend = LocalStorage(get_absolute_path("data/files"))
    StorageManager(local_backend).remove(file_path)

View file

@ -0,0 +1,37 @@
import os
from typing import BinaryIO
from .StorageManager import Storage
class LocalStorage(Storage):
    """Filesystem-backed Storage rooted at *storage_path*."""

    storage_path: str = None

    def __init__(self, storage_path: str):
        self.storage_path = storage_path

    def store(self, file_path: str, data: BinaryIO):
        """Write all of *data* to storage_path/file_path, creating directories.

        Previously only the storage root was created, so nested paths like
        "docs/report.pdf" failed with FileNotFoundError; now the file's own
        parent directory is ensured.
        """
        full_file_path = self.storage_path + "/" + file_path

        LocalStorage.ensure_directory_exists(os.path.dirname(full_file_path))

        with open(full_file_path, "wb") as f:
            f.write(data.read())

    def retrieve(self, file_path: str):
        """Return the raw bytes stored at storage_path/file_path."""
        full_file_path = self.storage_path + "/" + file_path

        with open(full_file_path, "rb") as f:
            return f.read()

    @staticmethod
    def ensure_directory_exists(file_path: str):
        """Create *file_path* and any missing parents; no-op when it exists."""
        os.makedirs(file_path, exist_ok = True)

    def remove(self, file_path: str):
        """Delete the file stored at storage_path/file_path."""
        os.remove(self.storage_path + "/" + file_path)

View file

@ -0,0 +1,26 @@
from typing import Protocol, BinaryIO


class Storage(Protocol):
    """Structural interface for byte-storage backends."""

    def store(self, file_path: str, data: bytes):
        pass

    def retrieve(self, file_path: str):
        pass

    def remove(self, file_path: str):
        pass


class StorageManager():
    """Thin facade that delegates every call to a concrete Storage backend."""

    storage: Storage = None

    def __init__(self, storage: Storage):
        self.storage = storage

    def store(self, file_path: str, data: BinaryIO):
        """Delegate writing *data* at *file_path* to the backend."""
        return self.storage.store(file_path, data)

    def retrieve(self, file_path: str):
        """Delegate reading *file_path* to the backend."""
        return self.storage.retrieve(file_path)

    def remove(self, file_path: str):
        """Delegate deleting *file_path* to the backend."""
        return self.storage.remove(file_path)

View file

@ -0,0 +1 @@
from .LocalStorage import LocalStorage

View file

@ -0,0 +1,27 @@
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
def extract_keywords(text: str) -> list[str]:
    """Return up to 15 TF-IDF-ranked nouns (longer than 3 chars) from *text*."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = [word for (word, tag) in tagged_tokens if tag == "NN"]

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(nouns)

    # Rank vocabulary entries by their TF-IDF weight, best first.
    ranked_nouns = sorted(
        vectorizer.vocabulary_,
        key = lambda noun: tfidf[0, vectorizer.vocabulary_[noun]],
        reverse = True
    )

    # Keep the best-scoring nouns, skipping short words, capped at 15.
    keywords = [word for word in ranked_nouns if len(word) > 3][:15]

    return keywords

View file

@ -0,0 +1,44 @@
from typing import BinaryIO, TypedDict
import filetype
from unstructured.cleaners.core import clean
from unstructured.partition.pdf import partition_pdf
from .extract_keywords import extract_keywords
class FileTypeException(Exception):
    """Raised when a file's type cannot be determined."""

    message: str

    def __init__(self, message: str):
        self.message = message
class FileMetadata(TypedDict):
    """Metadata extracted from an ingested file (see get_file_metadata)."""
    name: str
    # Added: get_file_metadata populates file_path and add_dlt reads it,
    # but the key was missing from this declaration.
    file_path: str
    mime_type: str
    extension: str
    keywords: list[str]
def get_file_metadata(file: BinaryIO) -> FileMetadata:
    """Extract name, path, mime type, extension and keywords from an open binary file.

    Raises FileTypeException when the file type cannot be guessed.
    """
    # NOTE(review): filetype.guess reads from the stream; presumably it seeks
    # back (or partition_pdf tolerates the offset) — confirm the read position.
    file_type = filetype.guess(file)

    if file_type is None:
        raise FileTypeException("Unknown file detected.")

    keywords: list = []

    # Only PDFs get keyword extraction; other types yield an empty list.
    if file_type.extension == "pdf":
        elements = partition_pdf(file = file, strategy = "fast")
        keywords = extract_keywords(
            "\n".join(map(lambda element: clean(element.text), elements))
        )

    file_path = file.name
    # NOTE(review): splitting on "." keeps only the part before the FIRST dot,
    # so "report.v2.pdf" becomes "report" — confirm this is intended.
    file_name = file_path.split("/")[-1].split(".")[0]

    return FileMetadata(
        name = file_name,
        file_path = file_path,
        mime_type = file_type.mime,
        extension = file_type.extension,
        keywords = keywords
    )

View file

@ -0,0 +1,4 @@
import os


def get_file_size(file_path: str):
    """Return the on-disk size of *file_path* in bytes."""
    file_stats = os.stat(file_path)
    return file_stats.st_size

View file

@ -0,0 +1,27 @@
import enum
from datetime import datetime, timezone

from sqlalchemy.orm import Mapped, MappedColumn
from sqlalchemy import Column, String, DateTime, ForeignKey, Enum, UUID, JSON

from cognitive_architecture.infrastructure.databases.relational import ModelBase


class OperationType(enum.Enum):
    """Kind of data operation being tracked.

    Previously subclassed sqlalchemy's `Enum` *column type* (the only Enum
    in scope), not Python's `enum.Enum`, which breaks `Column(Enum(...))`.
    """
    MERGE_DATA = "MERGE_DATA"
    APPEND_DATA = "APPEND_DATA"


class OperationStatus(enum.Enum):
    """Lifecycle state of an operation row."""
    STARTED = "OPERATION_STARTED"
    IN_PROGRESS = "OPERATION_IN_PROGRESS"
    COMPLETE = "OPERATION_COMPLETE"
    ERROR = "OPERATION_ERROR"
    CANCELLED = "OPERATION_CANCELLED"


class Operation(ModelBase):
    """Relational record tracking the status of a data operation."""

    __tablename__ = "operation"

    id = Column(String, primary_key = True)
    # sqlalchemy.Enum maps the Python enum onto a DB enum/varchar column.
    status = Column(Enum(OperationStatus))
    operation_type = Column(Enum(OperationType))
    data_id = Column(UUID, ForeignKey("data.id"))
    meta_data: Mapped[dict] = MappedColumn(type_ = JSON)  # "metadata" is reserved
    # Timezone-aware timestamp for consistency with the other models
    # (datetime.utcnow produces naive datetimes and is deprecated).
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))

View file

@ -0,0 +1,3 @@
from .classify import classify
from .identify import identify
from .save import save

View file

@ -0,0 +1,13 @@
from io import BufferedReader
from typing import Union, BinaryIO
from .exceptions import IngestionException
from .data_types import create_text_data, create_binary_data
def classify(data: Union[str, BinaryIO]):
    """Wrap raw input in the matching IngestionData container.

    Strings become TextData; buffered binary streams become BinaryData.

    Raises IngestionException for unsupported input types.
    """
    from io import BufferedIOBase  # local import: broadens the original check

    if isinstance(data, str):
        return create_text_data(data)

    # Accept any buffered binary stream (BufferedReader, BytesIO, ...), not
    # just BufferedReader: the signature already promises BinaryIO, but
    # in-memory streams previously fell through to the exception.
    if isinstance(data, BufferedIOBase):
        return create_binary_data(data)

    raise IngestionException(f"Data sent to cognee.classify(data: any) not supported: {type(data)}")

View file

@ -0,0 +1,30 @@
from typing import BinaryIO
from cognitive_architecture.infrastructure.files import get_file_metadata, FileMetadata
from .IngestionData import IngestionData
def create_binary_data(data: BinaryIO):
    """Factory: wrap an open binary stream in a BinaryData container."""
    return BinaryData(data)


class BinaryData(IngestionData):
    """IngestionData backed by an open binary stream; metadata is lazy-loaded."""

    data: BinaryIO = None
    metadata: FileMetadata = None  # populated on first use by ensure_metadata()

    def __init__(self, data: BinaryIO):
        self.data = data

    def get_identifier(self):
        """Return a content identifier of the form "<mime>_<kw|kw|...>"."""
        self.ensure_metadata()

        return self.metadata["mime_type"] + "_" + "|".join(self.metadata["keywords"])

    def get_extension(self):
        self.ensure_metadata()

        return self.metadata["extension"]

    def ensure_metadata(self):
        """Extract file metadata once and cache it on the instance."""
        if self.metadata is None:
            self.metadata = get_file_metadata(self.data)

    def get_data(self):
        return self.data

View file

@ -0,0 +1,11 @@
from typing import Protocol, BinaryIO
class IngestionData(Protocol):
    """Structural interface for classified ingestion payloads."""

    data: str | BinaryIO = None
    metadata: dict = None

    def get_data(self):
        pass

    def get_extension(self):
        pass

View file

@ -0,0 +1,16 @@
from .IngestionData import IngestionData
def create_text_data(data: str):
    """Factory: wrap a raw string in a TextData container."""
    return TextData(data)


class TextData(IngestionData):
    """IngestionData backed by an in-memory string."""

    data: str = None

    def __init__(self, data: str):
        self.data = data

    def get_data(self):
        """Return the wrapped string."""
        return self.data

    def get_chunks(self):
        # Not implemented yet.
        pass

View file

@ -0,0 +1,3 @@
from .TextData import TextData, create_text_data
from .BinaryData import BinaryData, create_binary_data
from .IngestionData import IngestionData

View file

@ -0,0 +1,6 @@
class IngestionException(Exception):
    """Raised when ingestion receives a payload it cannot handle."""

    message: str

    def __init__(self, message: str):
        self.message = message

View file

@ -0,0 +1,9 @@
from uuid import UUID, uuid5
from .data_types import IngestionData
# Fixed namespace UUID anchoring all deterministic content identifiers.
null_uuid: UUID = UUID("00000000-0000-0000-0000-000000000000")


def identify(data: IngestionData) -> UUID:
    """Derive a deterministic UUID (uuid5) from the data's content identifier."""
    content_identifier: str = data.get_identifier()
    return uuid5(null_uuid, content_identifier)

View file

@ -0,0 +1,29 @@
import asyncio
from uuid import UUID, uuid4
from cognitive_architecture.infrastructure.files import add_file_to_storage
from cognitive_architecture.infrastructure.data import add_data_to_dataset, Data, Dataset
from .data_types import IngestionData
async def save(dataset_id: UUID, dataset_name: str, data_id: UUID, data: IngestionData):
    """Register *data* under the dataset with a fresh raw-file location.

    Raw-file upload itself is currently disabled (commented out); only the
    relational records are written.
    """
    # Random file name; the extension comes from the classified data type.
    file_path = uuid4().hex + "." + data.get_extension()

    promises = []

    # promises.append(add_file_to_storage(file_path, data.get_data()))

    # NOTE(review): reads data.metadata directly — populated for BinaryData via
    # ensure_metadata, but None for TextData; confirm save() is binary-only.
    promises.append(
        add_data_to_dataset(
            Dataset(
                id = dataset_id,
                # Fall back to the dataset id when no name was provided.
                name = dataset_name if dataset_name else dataset_id.hex
            ),
            Data(
                id = data_id,
                raw_data_location = file_path,
                name = data.metadata["name"],
                meta_data = data.metadata
            )
        )
    )

    await asyncio.gather(*promises)

View file

@ -0,0 +1,6 @@
from os import path

# Absolute directory containing this module; anchor for project-relative paths.
ROOT_DIR = path.dirname(path.abspath(__file__))


def get_absolute_path(path_from_root: str) -> str:
    """Resolve *path_from_root*, taken relative to ROOT_DIR, to an absolute path."""
    joined = path.join(ROOT_DIR, path_from_root)
    return path.abspath(joined)

3783
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -17,7 +17,7 @@ classifiers = [
"Operating System :: Microsoft :: Windows",]
[tool.poetry.dependencies]
python = "^3.10"
python = "3.10.13"
langchain = "^0.0.338"
openai = "1.12.0"
python-dotenv = "1.0.1"
@ -40,6 +40,12 @@ pymupdf = "^1.23.25"
pandas = "^2.2.1"
greenlet = "^3.0.3"
ruff = "^0.2.2"
filetype = "^1.2.0"
unstructured = {extras = ["all-docs"], version = "^0.12.5"}
nltk = "^3.8.1"
scikit-learn = "^1.4.1.post1"
dlt = "^0.4.6"
duckdb = {version = "^0.10.0", extras = ["dlt"]}
[tool.poetry.extras]
dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
@ -73,6 +79,7 @@ pytest = "^7.4.0"
pytest-asyncio = "^0.21.1"
coverage = "^7.3.2"
mypy = "^1.7.1"
notebook = "^7.1.1"
[tool.poetry.group.docs.dependencies]
mkdocs = "^1.4.3"

525
vector_retrieval_demo.ipynb Normal file
View file

@ -0,0 +1,525 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "50d5afda-418f-436b-b467-004863193d4a",
"metadata": {},
"outputs": [],
"source": [
"articles = [\n",
"\"\"\"Edward VII (Albert Edward; 9 November 1841 – 6 May 1910) was King of the United Kingdom and the British Dominions, and Emperor of India, from 22 January 1901 until his death in 1910.\n",
"The second child and eldest son of Queen Victoria and Prince Albert of Saxe-Coburg and Gotha, Edward, nicknamed \"Bertie\", was related to royalty throughout Europe. He was Prince of Wales and heir apparent to the British throne for almost 60 years. During his mother's reign, he was largely excluded from political influence and came to personify the fashionable, leisured elite. He married Princess Alexandra of Denmark in 1863, and the couple had six children. As Prince of Wales, Edward travelled throughout Britain performing ceremonial public duties and represented Britain on visits abroad. His tours of North America in 1860 and of the Indian subcontinent in 1875 proved popular successes, but despite public approval, his reputation as a playboy prince soured his relationship with his mother.\n",
"Edward inherited the throne upon his mother's death in 1901. The King played a role in the modernisation of the British Home Fleet and the reorganisation of the British Army after the Second Boer War of 18991902. He re-instituted traditional ceremonies as public displays and broadened the range of people with whom royalty socialised. He fostered good relations between Britain and other European countries, especially France, for which he was popularly called \"Peacemaker\", but his relationship with his nephew, German Emperor Wilhelm II, was poor. The Edwardian era, which covered Edward's reign and was named after him, coincided with the start of a new century and heralded significant changes in technology and society, including steam turbine propulsion and the rise of socialism. He died in 1910 in the midst of a constitutional crisis that was resolved the following year by the Parliament Act 1911, which restricted the power of the unelected House of Lords. Edward was succeeded by his only surviving son, George V.\"\"\",\n",
" \"\"\"George V (George Frederick Ernest Albert; 3 June 1865 – 20 January 1936) was King of the United Kingdom and the British Dominions, and Emperor of India, from 6 May 1910 until his death in 1936.\n",
"George was born during the reign of his paternal grandmother, Queen Victoria, as the second son of the Prince and Princess of Wales (later King Edward VII and Queen Alexandra). He was third in the line of succession to the British throne behind his father and his elder brother, Prince Albert Victor. From 1877 to 1892, George served in the Royal Navy, until his elder brother's unexpected death in January 1892 put him directly in line for the throne. The next year, George married his brother's fiancée, Princess Victoria Mary of Teck, and they had six children. When Queen Victoria died in 1901, George's father ascended the throne as Edward VII, and George was created Prince of Wales. He became king-emperor on his father's death in 1910.\n",
"George's reign saw the rise of socialism, communism, fascism, Irish republicanism, and the Indian independence movement, all of which radically changed the political landscape of the British Empire, which itself reached its territorial peak by the beginning of the 1920s. The Parliament Act 1911 established the supremacy of the elected British House of Commons over the unelected House of Lords. As a result of the First World War (19141918), the empires of his first cousins Nicholas II of Russia and Wilhelm II of Germany fell, while the British Empire expanded to its greatest effective extent. In 1917, George became the first monarch of the House of Windsor, which he renamed from the House of Saxe-Coburg and Gotha as a result of anti-German public sentiment. He appointed the first Labour ministry in 1924, and the 1931 Statute of Westminster recognised the Empire's Dominions as separate, independent states within the British Commonwealth of Nations.\n",
"George suffered from smoking-related health problems during his later reign. On his death in January 1936, he was succeeded by his eldest son, Edward VIII. Edward abdicated in December of that year and was succeeded by his younger brother Albert, who took the regnal name George VI.\"\"\",\n",
"\"\"\"Edward VIII (Edward Albert Christian George Andrew Patrick David; 23 June 1894 28 May 1972), later known as the Duke of Windsor, was King of the United Kingdom and the Dominions of the British Empire, and Emperor of India, from 20 January 1936 until his abdication in December of the same year.[a]\n",
"Edward was born during the reign of his great-grandmother Queen Victoria as the eldest child of the Duke and Duchess of York, later King George V and Queen Mary. He was created Prince of Wales on his 16th birthday, seven weeks after his father succeeded as king. As a young man, Edward served in the British Army during the First World War and undertook several overseas tours on behalf of his father. The Prince of Wales gained popularity due to his charm and charisma, and his fashion sense became a hallmark of the era. After the war, his conduct began to give cause for concern; he engaged in a series of sexual affairs that worried both his father and the British prime minister, Stanley Baldwin.\n",
"Upon his father's death in 1936, Edward became the second monarch of the House of Windsor. The new king showed impatience with court protocol, and caused consternation among politicians by his apparent disregard for established constitutional conventions. Only months into his reign, a constitutional crisis was caused by his proposal to marry Wallis Simpson, an American who had divorced her first husband and was seeking a divorce from her second. The prime ministers of the United Kingdom and the Dominions opposed the marriage, arguing a divorced woman with two living ex-husbands was politically and socially unacceptable as a prospective queen consort. Additionally, such a marriage would have conflicted with Edward's status as titular head of the Church of England, which, at the time, disapproved of remarriage after divorce if a former spouse was still alive. Edward knew the Baldwin government would resign if the marriage went ahead, which could have forced a general election and would have ruined his status as a politically neutral constitutional monarch. When it became apparent he could not marry Simpson and remain on the throne, he abdicated. He was succeeded by his younger brother, George VI. With a reign of 326 days, Edward was one of the shortest-reigning British monarchs to date.\n",
"After his abdication, Edward was created Duke of Windsor. He married Simpson in France on 3 June 1937, after her second divorce became final. Later that year, the couple toured Nazi Germany, which fed rumours that he was a Nazi sympathiser. During the Second World War, Edward was at first stationed with the British Military Mission to France but after the fall of France was appointed Governor of the Bahamas. After the war, Edward spent the rest of his life in France. He and Wallis remained married until his death in 1972; they had no children.\"\"\",\n",
"\"\"\"George VI (Albert Frederick Arthur George; 14 December 1895 6 February 1952) was King of the United Kingdom and the Dominions of the British Commonwealth from 11 December 1936 until his death on 6 February 1952. He was also the last Emperor of India from 1936 until the British Raj was dissolved in August 1947, and the first head of the Commonwealth following the London Declaration of 1949.\n",
"The future George VI was born during the reign of his great-grandmother Queen Victoria; he was named Albert at birth after his great-grandfather Prince Albert of Saxe-Coburg and Gotha and was known as \"Bertie\" to his family and close friends. His father ascended the throne as George V in 1910. As the second son of the king, Albert was not expected to inherit the throne. He spent his early life in the shadow of his elder brother, Edward, the heir apparent. Albert attended naval college as a teenager and served in the Royal Navy and Royal Air Force during the First World War. In 1920, he was made Duke of York. He married Lady Elizabeth Bowes-Lyon in 1923, and they had two daughters, Elizabeth and Margaret. In the mid-1920s, he engaged speech therapist Lionel Logue to treat his stutter, which he learned to manage to some degree. His elder brother ascended the throne as Edward VIII after their father died in 1936, but Edward abdicated later that year to marry the twice-divorced American socialite Wallis Simpson. As heir presumptive to Edward VIII, Albert became king, taking the regnal name George VI.\n",
"In September 1939, the British Empire and most Commonwealth countries—but not Ireland—declared war on Nazi Germany, following the invasion of Poland. War with the Kingdom of Italy and the Empire of Japan followed in 1940 and 1941, respectively. George VI was seen as sharing the hardships of the common people and his popularity soared. Buckingham Palace was bombed during the Blitz while the King and Queen were there, and his younger brother the Duke of Kent was killed on active service. George became known as a symbol of British determination to win the war. Britain and its allies were victorious in 1945, but the British Empire declined. Ireland had largely broken away, followed by the independence of India and Pakistan in 1947. George relinquished the title of Emperor of India in June 1948 and instead adopted the new title of Head of the Commonwealth. He was beset by smoking-related health problems in the later years of his reign and died at Sandringham House, aged 56, of a coronary thrombosis in 1952. He was succeeded by his elder daughter, Elizabeth II.\"\"\",\n",
"\"\"\"Elizabeth II (Elizabeth Alexandra Mary; 21 April 1926 8 September 2022) was Queen of the United Kingdom and other Commonwealth realms from 6 February 1952 until her death in 2022. She was queen regnant of 32 sovereign states over the course of her lifetime and remained the monarch of 15 realms by the time of her death. Her reign of over 70 years is the longest of any British monarch, the longest of any female monarch, and the second longest verified reign of any monarch of a sovereign state in history.\n",
"Elizabeth was born in Mayfair, London, during the reign of her paternal grandfather, King George V. She was the first child of the Duke and Duchess of York (later King George VI and Queen Elizabeth The Queen Mother). Her father acceded to the throne in 1936 upon the abdication of his brother Edward VIII, making the ten-year-old Princess Elizabeth the heir presumptive. She was educated privately at home and began to undertake public duties during the Second World War, serving in the Auxiliary Territorial Service. In November 1947, she married Philip Mountbatten, a former prince of Greece and Denmark, and their marriage lasted 73 years until his death in 2021. They had four children: Charles, Anne, Andrew, and Edward.\n",
"When her father died in February 1952, Elizabeth—then 25 years old—became queen of seven independent Commonwealth countries: the United Kingdom, Canada, Australia, New Zealand, South Africa, Pakistan, and Ceylon (known today as Sri Lanka), as well as head of the Commonwealth. Elizabeth reigned as a constitutional monarch through major political changes such as the Troubles in Northern Ireland, devolution in the United Kingdom, the decolonisation of Africa, and the United Kingdom's accession to the European Communities, as well as its subsequent withdrawal. The number of her realms varied over time as territories gained independence and some realms became republics. As queen, Elizabeth was served by more than 170 prime ministers across her realms. Her many historic visits and meetings included state visits to China in 1986, to Russia in 1994, and to the Republic of Ireland in 2011, and meetings with five popes and fourteen US presidents.\n",
"Significant events included Elizabeth's coronation in 1953 and the celebrations of her Silver, Golden, Diamond, and Platinum jubilees in 1977, 2002, 2012, and 2022, respectively. Although she faced occasional republican sentiment and media criticism of her family—particularly after the breakdowns of her children's marriages, her annus horribilis in 1992, and the death in 1997 of her former daughter-in-law Diana—support for the monarchy in the United Kingdom remained consistently high throughout her lifetime, as did her personal popularity. Elizabeth died aged 96 at Balmoral Castle in September 2022, and was succeeded by her eldest son, Charles III.\"\"\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e328a903-d084-4d07-9b95-0a9196d7f719",
"metadata": {},
"outputs": [],
"source": [
"categories = {\n",
" \"Natural Language Text\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Articles, essays, and reports\",\n",
" \"Books and manuscripts\",\n",
" \"News stories and blog posts\",\n",
" \"Research papers and academic publications\",\n",
" \"Social media posts and comments\",\n",
" \"Website content and product descriptions\",\n",
" \"Personal narratives and stories\"\n",
" ]\n",
" },\n",
" \"Structured Documents\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Spreadsheets and tables\",\n",
" \"Forms and surveys\",\n",
" \"Databases and CSV files\"\n",
" ]\n",
" },\n",
" \"Code and Scripts\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Source code in various programming languages\",\n",
" \"Shell commands and scripts\",\n",
" \"Markup languages (HTML, XML)\",\n",
" \"Stylesheets (CSS) and configuration files (YAML, JSON, INI)\"\n",
" ]\n",
" },\n",
" \"Conversational Data\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Chat transcripts and messaging history\",\n",
" \"Customer service logs and interactions\",\n",
" \"Conversational AI training data\"\n",
" ]\n",
" },\n",
" \"Educational Content\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Textbook content and lecture notes\",\n",
" \"Exam questions and academic exercises\",\n",
" \"E-learning course materials\"\n",
" ]\n",
" },\n",
" \"Creative Writing\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Poetry and prose\",\n",
" \"Scripts for plays, movies, and television\",\n",
" \"Song lyrics\"\n",
" ]\n",
" },\n",
" \"Technical Documentation\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Manuals and user guides\",\n",
" \"Technical specifications and API documentation\",\n",
" \"Helpdesk articles and FAQs\"\n",
" ]\n",
" },\n",
" \"Legal and Regulatory Documents\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Contracts and agreements\",\n",
" \"Laws, regulations, and legal case documents\",\n",
" \"Policy documents and compliance materials\"\n",
" ]\n",
" },\n",
" \"Medical and Scientific Texts\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Clinical trial reports\",\n",
" \"Patient records and case notes\",\n",
" \"Scientific journal articles\"\n",
" ]\n",
" },\n",
" \"Financial and Business Documents\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Financial reports and statements\",\n",
" \"Business plans and proposals\",\n",
" \"Market research and analysis reports\"\n",
" ]\n",
" },\n",
" \"Advertising and Marketing Materials\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Ad copies and marketing slogans\",\n",
" \"Product catalogs and brochures\",\n",
" \"Press releases and promotional content\"\n",
" ]\n",
" },\n",
" \"Emails and Correspondence\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Professional and formal correspondence\",\n",
" \"Personal emails and letters\"\n",
" ]\n",
" },\n",
" \"Metadata and Annotations\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Image and video captions\",\n",
" \"Annotations and metadata for various media\"\n",
" ]\n",
" },\n",
" \"Language Learning Materials\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Vocabulary lists and grammar rules\",\n",
" \"Language exercises and quizzes\"\n",
" ]\n",
" },\n",
" \"Audio Content\": {\n",
" \"type\": \"AUDIO\",\n",
" \"subclass\": [\n",
" \"Music tracks and albums\",\n",
" \"Podcasts and radio broadcasts\",\n",
" \"Audiobooks and audio guides\",\n",
" \"Recorded interviews and speeches\",\n",
" \"Sound effects and ambient sounds\"\n",
" ]\n",
" },\n",
" \"Image Content\": {\n",
" \"type\": \"IMAGE\",\n",
" \"subclass\": [\n",
" \"Photographs and digital images\",\n",
" \"Illustrations, diagrams, and charts\",\n",
" \"Infographics and visual data representations\",\n",
" \"Artwork and paintings\",\n",
" \"Screenshots and graphical user interfaces\"\n",
" ]\n",
" },\n",
" \"Video Content\": {\n",
" \"type\": \"VIDEO\",\n",
" \"subclass\": [\n",
" \"Movies and short films\",\n",
" \"Documentaries and educational videos\",\n",
" \"Video tutorials and how-to guides\",\n",
" \"Animated features and cartoons\",\n",
" \"Live event recordings and sports broadcasts\"\n",
" ]\n",
" },\n",
" \"Multimedia Content\": {\n",
" \"type\": \"MULTIMEDIA\",\n",
" \"subclass\": [\n",
" \"Interactive web content and games\",\n",
" \"Virtual reality (VR) and augmented reality (AR) experiences\",\n",
" \"Mixed media presentations and slide decks\",\n",
" \"E-learning modules with integrated multimedia\",\n",
" \"Digital exhibitions and virtual tours\"\n",
" ]\n",
" },\n",
" \"3D Models and CAD Content\": {\n",
" \"type\": \"3D_MODEL\",\n",
" \"subclass\": [\n",
" \"Architectural renderings and building plans\",\n",
" \"Product design models and prototypes\",\n",
" \"3D animations and character models\",\n",
" \"Scientific simulations and visualizations\",\n",
" \"Virtual objects for AR/VR environments\"\n",
" ]\n",
" },\n",
" \"Procedural Content\": {\n",
" \"type\": \"PROCEDURAL\",\n",
" \"subclass\": [\n",
" \"Tutorials and step-by-step guides\",\n",
" \"Workflow and process descriptions\",\n",
" \"Simulation and training exercises\",\n",
" \"Recipes and crafting instructions\"\n",
" ]\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "89a3f3a0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/borisarzentar/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"\u001b[32m2024-03-02 11:50:12.400\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mfastembed.embedding\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[33m\u001b[1mDefaultEmbedding, FlagEmbedding, JinaEmbedding are deprecated.Use from fastembed import TextEmbedding instead.\u001b[0m\n"
]
}
],
"source": [
"import uuid\n",
"from os import path\n",
"from qdrant_client.models import PointStruct, Distance\n",
"from cognitive_architecture.root_dir import ROOT_DIR\n",
"from fastembed import TextEmbedding\n",
"# from cognitive_architecture.openai_tools import get_embedding_with_backoff\n",
"from cognitive_architecture.infrastructure.databases.vector import QDrantAdapter, CollectionConfig"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "659e327e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching 9 files: 0%| | 0/9 [00:00<?, ?it/s]\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"model_quantized.onnx: 100%|██████████| 279M/279M [08:28<00:00, 548kB/s]\n",
"\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"model.onnx: 100%|██████████| 1.11G/1.11G [24:44<00:00, 748kB/s]\n",
"Fetching 9 files: 100%|██████████| 9/9 [24:46<00:00, 165.16s/it]\n"
]
},
{
"ename": "ValidationError",
"evalue": "2 validation errors for PointStruct\nvector.list[float]\n Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/list_type\nvector.dict[str,union[SparseVector,list[float]]]\n Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/dict_type",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 39\u001b[0m\n\u001b[1;32m 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_data_points(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, data_points)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n\u001b[0;32m---> 39\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m run()\n",
"Cell \u001b[0;32mIn[5], line 36\u001b[0m, in \u001b[0;36mrun\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_collection(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, CollectionConfig(\n\u001b[1;32m 29\u001b[0m vector_config \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 30\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msize\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m1536\u001b[39m,\n\u001b[1;32m 31\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdistance\u001b[39m\u001b[38;5;124m\"\u001b[39m: Distance\u001b[38;5;241m.\u001b[39mDOT\n\u001b[1;32m 32\u001b[0m }\n\u001b[1;32m 33\u001b[0m ))\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n\u001b[0;32m---> 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_data_points(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, data_points)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/cognitive_architecture/infrastructure/databases/vector/qdrant/QDrantAdapter.py:58\u001b[0m, in \u001b[0;36mQDrantAdapter.create_data_points\u001b[0;34m(self, collection_name, data_points)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_data_points\u001b[39m(\u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, data_points: List[\u001b[38;5;28many\u001b[39m]):\n\u001b[1;32m 56\u001b[0m client \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_qdrant_client()\n\u001b[0;32m---> 58\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m client\u001b[38;5;241m.\u001b[39mupload_points(\n\u001b[1;32m 59\u001b[0m collection_name \u001b[38;5;241m=\u001b[39m collection_name,\n\u001b[1;32m 60\u001b[0m points \u001b[38;5;241m=\u001b[39m data_points\n\u001b[1;32m 61\u001b[0m )\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/async_qdrant_client.py:1800\u001b[0m, in \u001b[0;36mAsyncQdrantClient.upload_points\u001b[0;34m(self, collection_name, points, batch_size, parallel, method, max_retries, wait, shard_key_selector, **kwargs)\u001b[0m\n\u001b[1;32m 1776\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Upload points to the collection\u001b[39;00m\n\u001b[1;32m 1777\u001b[0m \n\u001b[1;32m 1778\u001b[0m \u001b[38;5;124;03mSimilar to `upload_collection` method, but operates with points, rather than vector and payload individually.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1797\u001b[0m \n\u001b[1;32m 1798\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1799\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(kwargs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnknown arguments: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1800\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39mupload_points(\n\u001b[1;32m 1801\u001b[0m collection_name\u001b[38;5;241m=\u001b[39mcollection_name,\n\u001b[1;32m 1802\u001b[0m points\u001b[38;5;241m=\u001b[39mpoints,\n\u001b[1;32m 1803\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m 1804\u001b[0m parallel\u001b[38;5;241m=\u001b[39mparallel,\n\u001b[1;32m 1805\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 1806\u001b[0m max_retries\u001b[38;5;241m=\u001b[39mmax_retries,\n\u001b[1;32m 1807\u001b[0m wait\u001b[38;5;241m=\u001b[39mwait,\n\u001b[1;32m 1808\u001b[0m 
shard_key_selector\u001b[38;5;241m=\u001b[39mshard_key_selector,\n\u001b[1;32m 1809\u001b[0m )\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/local/async_qdrant_local.py:661\u001b[0m, in \u001b[0;36mAsyncQdrantLocal.upload_points\u001b[0;34m(self, collection_name, points, **kwargs)\u001b[0m\n\u001b[1;32m 658\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mupload_points\u001b[39m(\n\u001b[1;32m 659\u001b[0m \u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, points: Iterable[types\u001b[38;5;241m.\u001b[39mPointStruct], \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any\n\u001b[1;32m 660\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 661\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_upload_points\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/local/async_qdrant_local.py:673\u001b[0m, in \u001b[0;36mAsyncQdrantLocal._upload_points\u001b[0;34m(self, collection_name, points)\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_upload_points\u001b[39m(\n\u001b[1;32m 669\u001b[0m \u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, points: Iterable[Union[types\u001b[38;5;241m.\u001b[39mPointStruct, types\u001b[38;5;241m.\u001b[39mRecord]]\n\u001b[1;32m 670\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 671\u001b[0m collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_collection(collection_name)\n\u001b[1;32m 672\u001b[0m collection\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[0;32m--> 673\u001b[0m \u001b[43m[\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mrest_models\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPointStruct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvector\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpayload\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpayload\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 677\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpoint\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\n\u001b[1;32m 678\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 679\u001b[0m )\n",
"Cell \u001b[0;32mIn[5], line 17\u001b[0m, in \u001b[0;36mcreate_data_point\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_data_point\u001b[39m(data: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m PointStruct:\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPointStruct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43muuid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muuid4\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mvector\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43membed_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mpayload\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mraw\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mValidationError\u001b[0m: 2 validation errors for PointStruct\nvector.list[float]\n Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/list_type\nvector.dict[str,union[SparseVector,list[float]]]\n Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/dict_type"
]
}
],
"source": [
"database_path = path.join(path.abspath(ROOT_DIR), \"database/data\", \"vector_retrieval_demo.db\")\n",
"\n",
"try:\n",
" import shutil\n",
" shutil.rmtree(database_path)\n",
"except Exception as exception:\n",
" print(exception)\n",
" pass\n",
"\n",
"def embed_data(data: str):\n",
" embedding_engine = TextEmbedding(model_name = \"sentence-transformers/paraphrase-multilingual-mpnet-base-v2\")\n",
" embedding_engine.embed(documents = [data])\n",
"\n",
"qdrant_client = QDrantAdapter(qdrant_path = database_path, qdrant_url = None, qdrant_api_key = None)\n",
"\n",
"def create_data_point(data: str) -> PointStruct:\n",
" return PointStruct(\n",
" id = str(uuid.uuid4()),\n",
" vector = embed_data(data),\n",
" payload = {\n",
" \"raw\": data,\n",
" }\n",
" )\n",
"\n",
"data_points = map(create_data_point, articles)\n",
"\n",
"async def run():\n",
" result = await qdrant_client.create_collection(\"test_collection\", CollectionConfig(\n",
" vector_config = {\n",
" \"size\": 1536,\n",
" \"distance\": Distance.DOT\n",
" }\n",
" ))\n",
" print(result)\n",
"\n",
" result = await qdrant_client.create_data_points(\"test_collection\", data_points)\n",
" print(result)\n",
"\n",
"await run()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b6163a2",
"metadata": {},
"outputs": [],
"source": [
"qdrant_client = QDrantAdapter(qdrant_path = database_path, qdrant_url = None, qdrant_api_key = None)\n",
"\n",
"query_vector = embed_data(\"Last emperor of India\")\n",
"\n",
"results = await qdrant_client.find_related_data_points(collection_name = \"test_collection\", query_vector = query_vector)\n",
"\n",
"import json\n",
"\n",
"for result in results:\n",
" print(result.score)\n",
"\n",
"results[0].payload[\"raw\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d18b00f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}