feat: save files metadata in duckdb

This commit is contained in:
Boris Arzentar 2024-03-12 13:42:51 +01:00
parent 279c7a0789
commit 8cbf488e59
75 changed files with 5096 additions and 180 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

6
.dlt/config.toml Normal file
View file

@ -0,0 +1,6 @@
# put your configuration values here
[runtime]
log_level="WARNING" # the system log level of dlt
# use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry
dlthub_telemetry = false

5
.dlt/secrets.toml Normal file
View file

@ -0,0 +1,5 @@
# put your secret values and credentials here. do not share this file and do not push it to github
[destination.qdrant.credentials]
location = "https://cff4594b-c2de-4fcf-8365-4c1d3a1c1429.us-east4-0.gcp.cloud.qdrant.io:6333"
api_key = "K5BKjVGR8Qn4pVMk9nPFNTqITu3QVGR1O8qlDDH6kk52HUwB4lRjjw"

1
.python-version Normal file
View file

@ -0,0 +1 @@
3.10.13

156
cognee.add.ipynb Normal file
View file

@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "823c799a",
"metadata": {},
"outputs": [],
"source": [
"from os import listdir, path\n",
"from uuid import uuid5, UUID\n",
"from cognitive_architecture import add\n",
"\n",
"data_path = path.abspath(\".data\")\n",
"pdf_files = [file for file in listdir(data_path) if path.isfile(path.join(data_path, file))]\n",
"\n",
"await add(\n",
" list(map(\n",
" lambda file_path: f\"file://{path.join(data_path, file_path)}\",\n",
" pdf_files\n",
" ))[:3],\n",
" uuid5(UUID(\"00000000-0000-0000-0000-000000000000\"), \"pdf_files_cognee\"),\n",
" \"test-dataset\"\n",
")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c4d5a399",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline file_load_from_filesystem load step completed in 0.39 seconds\n",
"1 load package(s) were loaded to destination duckdb and into dataset pravilnik_energetska_efikasnost\n",
"The duckdb destination used duckdb:///:external: location to store data\n",
"Load package 1710243999.1369941 is LOADED and contains no failed jobs\n",
"Pipeline file_load_from_filesystem load step completed in 0.40 seconds\n",
"1 load package(s) were loaded to destination duckdb and into dataset pravilnik_energetska_efikasnost_sertifikati\n",
"The duckdb destination used duckdb:///:external: location to store data\n",
"Load package 1710246971.055336 is LOADED and contains no failed jobs\n"
]
}
],
"source": [
"from os import listdir, path\n",
"from uuid import uuid5, UUID\n",
"from cognitive_architecture import add_dlt\n",
"\n",
"data_path = path.abspath(\".data\")\n",
"# pdf_files = [file for file in listdir(data_path) if path.isfile(path.join(data_path, file))]\n",
"\n",
"# await add_dlt(\n",
"# list(map(\n",
"# lambda file_path: f\"file://{path.join(data_path, file_path)}\",\n",
"# pdf_files\n",
"# ))[:5],\n",
"# \"pdf_files\"\n",
"# )\n",
"\n",
"results = await add_dlt(data_path, \"pravilnik.energetska_efikasnost\")\n",
"for result in results:\n",
" print(result)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "47edce91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/borisarzentar/Projects/Topoteretes/cognee/cognitive_architecture/data/cognee/cognee.duckdb\n",
"┌──────────────────────┬──────────────────────┬──────────────────────┬───┬───────────────────┬────────────────┐\n",
"│ id │ name │ file_path │ … │ _dlt_load_id │ _dlt_id │\n",
"│ varchar │ varchar │ varchar │ │ varchar │ varchar │\n",
"├──────────────────────┼──────────────────────┼──────────────────────┼───┼───────────────────┼────────────────┤\n",
"│ 881ecb36-2819-54c3… │ Izmenjen-clan-17-P… │ /Users/borisarzent… │ … │ 1710242259.670676 │ /LPIFEK4ayoivQ │\n",
"├──────────────────────┴──────────────────────┴──────────────────────┴───┴───────────────────┴────────────────┤\n",
"│ 1 rows 8 columns (5 shown) │\n",
"└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n",
"\n",
"┌──────────────────────┬──────────────────────┬──────────────────────┬───┬────────────────────┬────────────────┐\n",
"│ id │ name │ file_path │ … │ _dlt_load_id │ _dlt_id │\n",
"│ varchar │ varchar │ varchar │ │ varchar │ varchar │\n",
"├──────────────────────┼──────────────────────┼──────────────────────┼───┼────────────────────┼────────────────┤\n",
"│ cd1dc11f-397b-5048… │ Pravilnik-o-sadrzi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ TrxbqUr6PepbTA │\n",
"│ 320cee87-d02e-540d… │ Pravilnik-o-nacinu… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ njwnSKr24K1vEQ │\n",
"│ bfe85a36-1427-555d… │ Pravilnik-o-izgled… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ hYgSJPKZJoQNHQ │\n",
"│ 5767b799-9815-5834… │ Pravilnik-o-postup… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ CAR2xYK53eR9Wg │\n",
"│ 9133b38e-d2aa-5916… │ Pravilnik-o-uslovi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ aiDe0Ggk5JMN/w │\n",
"│ bff79816-8610-5dfa… │ Pravilnik-o-sadrzi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ Fi2fQVYOI1lZ8w │\n",
"│ d3fbcf40-abcc-56d4… │ Pravilnik-o-uslovi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ aw1WBpNI62y+Kg │\n",
"│ 826bbd41-e322-5587… │ Pravilnik-o-katast… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ S5QOxjEv51lBBw │\n",
"│ f354abe5-bc7e-520f… │ Pravilnik-o-objekt… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ B5CinyB0UGlbng │\n",
"│ 1e47801b-2a4f-57cf… │ Pravilnik-o-sadrzi… │ /Users/borisarzent… │ … │ 1710243553.0357041 │ w6CcdYAB8ie+xw │\n",
"├──────────────────────┴──────────────────────┴──────────────────────┴───┴────────────────────┴────────────────┤\n",
"│ 10 rows 8 columns (5 shown) │\n",
"└──────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n",
"\n"
]
}
],
"source": [
"import duckdb\n",
"from cognitive_architecture.root_dir import get_absolute_path\n",
"\n",
"dataset_name = \"pdf_files\"\n",
"\n",
"db_path = get_absolute_path(\"./data/cognee\")\n",
"db_location = db_path + \"/cognee.duckdb\"\n",
"print(db_location)\n",
"\n",
"db = duckdb.connect(db_location)\n",
"\n",
"izmene = db.sql(f\"SELECT * FROM izmene.file_metadata;\")\n",
"\n",
"print(izmene)\n",
"\n",
"pravilnik = db.sql(f\"SELECT * FROM pravilnik.file_metadata;\")\n",
"\n",
"print(pravilnik)\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

BIN
cognitive_architecture/.DS_Store vendored Normal file

Binary file not shown.

View file

@ -0,0 +1,2 @@
from .api.v1.add.add import add
from .api.v1.add.add_dlt import add_dlt

View file

@ -0,0 +1 @@
from .add import add

View file

@ -0,0 +1,53 @@
import asyncio
from uuid import UUID, uuid4
from typing import Union, BinaryIO, List
import cognitive_architecture.modules.ingestion as ingestion
from cognitive_architecture.infrastructure import infrastructure_config
class DatasetException(Exception):
    """Raised when cognee.add receives missing or invalid data."""

    message: str

    def __init__(self, message: str):
        self.message = message
async def add(
    data: Union[str, BinaryIO, List[Union[str, BinaryIO]]],
    dataset_id: UUID = None,
    dataset_name: str = None
):
    """Ingest *data* into the dataset identified by *dataset_id*.

    Parameters:
        data: a raw text string, an open binary file, a "file://" path,
            or a list mixing any of those.
        dataset_id: target dataset UUID. When omitted, a fresh UUID is
            generated per call. (The previous `= uuid4()` default was
            evaluated once at import time, so every defaulted call silently
            shared the same dataset id.)
        dataset_name: optional human-readable dataset name.

    Returns the dataset UUID the data was saved under, or a list of results
    when *data* is a list.

    Raises DatasetException when *data* is empty.
    """
    if dataset_id is None:
        dataset_id = uuid4()

    db_engine = infrastructure_config.get_config()["database_engine"]

    # Lazily create the relational tables on first use.
    if db_engine.is_db_done is not True:
        await db_engine.ensure_tables()

    if not data:
        raise DatasetException("Data must be provided to cognee.add(data: str)")

    # A list fans out into concurrent add() calls sharing one dataset id.
    if isinstance(data, list):
        results = await asyncio.gather(*[
            add(data_item, dataset_id, dataset_name) for data_item in data
        ])
        return results

    # "file://..." strings are opened and re-dispatched as binary data.
    if is_data_path(data):
        with open(data.replace("file://", ""), "rb") as file:
            return await add(file, dataset_id, dataset_name)

    classified_data = ingestion.classify(data)
    data_id = ingestion.identify(classified_data)

    await ingestion.save(dataset_id, dataset_name, data_id, classified_data)

    return dataset_id
    # await ingestion.vectorize(dataset_id, dataset_name, data_id, classified_data)
def is_data_path(data: str) -> bool:
    """Return True when *data* is a "file://" path string."""
    return isinstance(data, str) and data.startswith("file://")

View file

@ -0,0 +1,91 @@
from typing import List, Union
from os import path, listdir
import asyncio
import dlt
import duckdb
from unstructured.cleaners.core import clean
from cognitive_architecture.root_dir import get_absolute_path
import cognitive_architecture.modules.ingestion as ingestion
from cognitive_architecture.infrastructure.files import get_file_metadata
from cognitive_architecture.infrastructure.files.storage import LocalStorage
async def add_dlt(file_paths: Union[str, List[str]], dataset_name: str = None):
    """Load file metadata into DuckDB through a dlt pipeline.

    *file_paths* is either a directory path (walked recursively — each
    sub-directory becomes a dot-separated dataset name) or an explicit list
    of file paths loaded under *dataset_name*.

    Returns the dlt run info (or a list of run infos for a directory walk).
    """
    if isinstance(file_paths, str):
        # Directory path provided, we need to extract the file paths and dataset name
        def list_dir_files(root_dir_path: str, parent_dir: str = "root"):
            datasets = {}

            for file_or_dir in listdir(root_dir_path):
                if path.isdir(path.join(root_dir_path, file_or_dir)):
                    # Build a dot-separated dataset name from the directory chain.
                    dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
                    dataset_name = clean(dataset_name.replace(" ", "_"))

                    nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

                    for dataset in nested_datasets:
                        datasets[dataset] = nested_datasets[dataset]
                else:
                    if parent_dir not in datasets:
                        datasets[parent_dir] = []

                    datasets[parent_dir].append(path.join(root_dir_path, file_or_dir))

            return datasets

        datasets = list_dir_files(file_paths)

        results = []

        for key in datasets:
            # When a dataset-name filter is given, only load matching datasets.
            if dataset_name is not None and not key.startswith(dataset_name):
                continue

            results.append(add_dlt(datasets[key], dataset_name = key))

        return await asyncio.gather(*results)

    db_path = get_absolute_path("./data/cognee")
    db_location = f"{db_path}/cognee.duckdb"

    LocalStorage.ensure_directory_exists(db_path)
    db = duckdb.connect(db_location)

    destination = dlt.destinations.duckdb(
        credentials = db,
    )

    pipeline = dlt.pipeline(
        pipeline_name = "file_load_from_filesystem",
        destination = destination,
    )

    @dlt.resource(standalone = True, merge_key = "id")
    def data_resources(file_paths: List[str]):
        # One metadata row per file; merge_key="id" lets re-runs merge rows
        # instead of duplicating them.
        for file_path in file_paths:
            with open(file_path.replace("file://", ""), mode = "rb") as file:
                classified_data = ingestion.classify(file)
                data_id = ingestion.identify(classified_data)

                file_metadata = get_file_metadata(classified_data.get_data())

                yield {
                    "id": data_id,
                    "name": file_metadata["name"],
                    "file_path": file_metadata["file_path"],
                    "extension": file_metadata["extension"],
                    "mime_type": file_metadata["mime_type"],
                    "keywords": "|".join(file_metadata["keywords"]),
                }

    run_info = pipeline.run(
        data_resources(file_paths),
        table_name = "file_metadata",
        dataset_name = dataset_name,
        write_disposition = "merge",
    )

    return run_info

View file

@ -1,32 +0,0 @@
from enum import Enum
from qdrant_client.models import Distance, VectorParams
from cognitive_architecture.modules.memory.vector import create_vector_memory
from cognitive_architecture.modules.users.memory import is_existing_memory, register_memory_for_user
from cognitive_architecture.infrastructure.databases.vector.qdrant.adapter import CollectionConfig
class MemoryType(Enum):
GRAPH = "GRAPH"
VECTOR = "VECTOR"
RELATIONAL = "RELATIONAL"
class MemoryException(Exception):
message: str
def __init__(self, message: str):
self.message = message
async def create_memory(user_id: str, memory_name: str, memory_type: MemoryType):
if await is_existing_memory(memory_name):
raise MemoryException(f'Memory with the name "{memory_name}" already exists. Memory names must be unique.')
match memory_type:
case MemoryType.VECTOR:
await create_vector_memory(memory_name, CollectionConfig(
vector_config = VectorParams(
size = 1536,
distance = Distance.DOT,
)
))
await register_memory_for_user(user_id, memory_name)

View file

@ -27,7 +27,7 @@ class Config:
)
)
db_path = Path(__file__).resolve().parent / "database/data"
db_path = str(Path(__file__).resolve().parent / "data/system")
vectordb: str = os.getenv("VECTORDB", "weaviate")
qdrant_path: str = os.getenv("QDRANT_PATH")

Binary file not shown.

Binary file not shown.

View file

@ -1,13 +1,11 @@
"""This module provides functionalities for creating and managing databases."""
import os
from contextlib import asynccontextmanager
from sqlalchemy import create_engine, text
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine
from config import Config
from database.relationaldb.database import Base, get_sqlalchemy_database_url
from database.relationaldb.models import memory, metadatas, operation, sessions, user, docs
from cognitive_architecture.config import Config
from cognitive_architecture.database.relationaldb.database import Base, get_sqlalchemy_database_url
globalConfig = Config()
@ -58,10 +56,8 @@ class DatabaseManager:
async def create_tables(self):
"""Create tables based on the SQLAlchemy Base metadata."""
try:
async with self.engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
async with self.engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
except Exception as e:
print(e)
raise e

View file

@ -0,0 +1,21 @@
from cognitive_architecture.config import Config
from .databases.relational import SqliteEngine, DatabaseEngine
config = Config()
config.load()


class InfrastructureConfig():
    """Process-wide holder for infrastructure singletons, lazily initialized."""

    # Shared relational engine; created on the first get_config() call.
    database_engine: DatabaseEngine = None

    def get_config(self) -> dict:
        """Return the infrastructure registry, creating defaults on first use."""
        if self.database_engine is None:
            # Default to a SQLite engine at the configured path.
            self.database_engine = SqliteEngine(config.db_path, config.db_name)

        return {
            "database_engine": self.database_engine
        }

    def set_config(self, new_config: dict):
        """Replace the configured database engine."""
        self.database_engine = new_config["database_engine"]


# Module-level singleton imported across the codebase.
infrastructure_config = InfrastructureConfig()

View file

@ -0,0 +1 @@
from .InfrastructureConfig import infrastructure_config

View file

@ -0,0 +1,4 @@
from .models.Data import Data
from .models.Dataset import Dataset
from .models.DatasetData import DatasetData
from .add_data_to_dataset import add_data_to_dataset

View file

@ -0,0 +1,45 @@
import logging
from . import Dataset, Data
from cognitive_architecture.infrastructure import infrastructure_config
from cognitive_architecture.infrastructure.databases.relational import DatabaseEngine
from cognitive_architecture.infrastructure.files import remove_file_from_storage
logger = logging.getLogger(__name__)
async def add_data_to_dataset(dataset: Dataset, data: Data):
    """Upsert *data* into *dataset*, replacing any superseded raw file.

    Four cases: both records exist (refresh raw location, maybe rename the
    dataset); only the dataset exists (attach the data); only the data exists
    (attach it to the new dataset); neither exists (create both).
    """
    db_engine: DatabaseEngine = infrastructure_config.get_config()["database_engine"]

    existing_dataset = (await db_engine.query_entity(dataset)).scalar()
    existing_data = (await db_engine.query_entity(data)).scalar()

    if existing_dataset:
        if existing_data:
            # The incoming data replaces the stored raw file.
            await remove_old_raw_data(existing_data.raw_data_location)

            def update_raw_data():
                existing_data.raw_data_location = data.raw_data_location

            await db_engine.update(update_raw_data)

            # Allow renaming the dataset when a name is provided.
            if existing_dataset.id == dataset.id and dataset.name is not None:
                def update_name(): existing_dataset.name = dataset.name
                await db_engine.update(update_name)
        else:
            await db_engine.update(lambda: existing_dataset.data.append(data))
    else:
        if existing_data:
            await remove_old_raw_data(existing_data.raw_data_location)
            # NOTE(review): this mutation happens outside the update callback,
            # unlike the branch above — confirm it is still flushed correctly.
            existing_data.raw_data_location = data.raw_data_location
            await db_engine.update(lambda: dataset.data.append(existing_data))
        else:
            await db_engine.update(lambda: dataset.data.append(data))

        await db_engine.create(dataset)
async def remove_old_raw_data(data_location: str):
    """Best-effort deletion of a superseded raw data file; failures are logged,
    never raised, so ingestion continues."""
    try:
        await remove_file_from_storage(data_location)
    except Exception:
        logger.error("Failed to remove old raw data file: %s", data_location)

View file

@ -0,0 +1,23 @@
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, MappedColumn, Mapped
from sqlalchemy import Column, String, DateTime, UUID, Text, JSON
from cognitive_architecture.infrastructure.databases.relational import ModelBase
from .DatasetData import DatasetData
class Data(ModelBase):
    """Relational record describing one ingested piece of raw data."""

    __tablename__ = "data"

    id = Column(UUID, primary_key = True)
    name = Column(String, nullable = True)
    description = Column(Text, nullable = True)
    raw_data_location = Column(String)
    meta_data: Mapped[dict] = MappedColumn(type_ = JSON)  # "metadata" is a reserved word

    # Callable defaults: evaluated per row. The previous
    # `default = datetime.now(timezone.utc)` was evaluated once at import
    # time, stamping every row with the same frozen timestamp.
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime, onupdate = lambda: datetime.now(timezone.utc))

    datasets: Mapped[List["Dataset"]] = relationship(
        secondary = DatasetData.__tablename__,
        back_populates = "data"
    )

View file

@ -0,0 +1,21 @@
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, Text, DateTime, UUID
from cognitive_architecture.infrastructure.databases.relational import ModelBase
from .DatasetData import DatasetData
class Dataset(ModelBase):
    """Relational record for a named collection of data items."""

    __tablename__ = "dataset"

    id = Column(UUID, primary_key = True)
    name = Column(Text)
    description = Column(Text, nullable = True)

    # Callable defaults so each row gets its own timestamp; the previous
    # `default = datetime.now(timezone.utc)` was computed once at import time.
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime, onupdate = lambda: datetime.now(timezone.utc))

    data: Mapped[List["Data"]] = relationship(
        secondary = DatasetData.__tablename__,
        back_populates = "datasets"
    )

View file

@ -0,0 +1,14 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, UUID, ForeignKey
from cognitive_architecture.infrastructure.databases.relational import ModelBase
class DatasetData(ModelBase):
    """Association table linking datasets and data records (many-to-many)."""

    __tablename__ = "dataset_data"

    # Pass the callable `uuid4`, not `uuid4()`: the previous form generated a
    # single UUID at import time and reused it for every inserted row.
    id = Column(UUID, primary_key = True, default = uuid4)
    # Callable default so the timestamp is taken per row, not once at import.
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))

    dataset_id = Column("dataset", UUID, ForeignKey("dataset.id"), primary_key = True)
    data_id = Column("data", UUID, ForeignKey("data.id"), primary_key = True)

View file

@ -0,0 +1,23 @@
from typing import Protocol
class DatabaseEngine(Protocol):
    """Structural interface every relational database engine must satisfy."""

    async def ensure_tables(self):
        """Create the database and its tables if they do not exist yet."""
        pass

    def database_exists(self, db_name: str) -> bool:
        pass

    def create_database(self, db_name: str):
        pass

    def drop_database(self, db_name: str):
        pass

    async def table_exists(self, table_name: str) -> bool:
        pass

    async def create_tables(self):
        pass

    async def create(self, data):
        """Persist a new entity."""
        pass

View file

@ -0,0 +1,3 @@
from sqlalchemy.orm import declarative_base
ModelBase = declarative_base()

View file

@ -0,0 +1,3 @@
from .ModelBase import ModelBase
from .DatabaseEngine import DatabaseEngine
from .sqlite.SqliteEngine import SqliteEngine

View file

@ -1,4 +0,0 @@
from .general.adapter import RelationalDBAdapter
def get_database():
return RelationalDBAdapter()

View file

@ -0,0 +1,82 @@
import os
import asyncio
from typing import Callable
from sqlalchemy.inspection import inspect
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncEngine, AsyncSession, async_scoped_session
from sqlalchemy.future import select
from cognitive_architecture.infrastructure.files.storage.LocalStorage import LocalStorage
from ..DatabaseEngine import DatabaseEngine
from ..ModelBase import ModelBase
from ..utils import with_rollback
class SqliteEngine(DatabaseEngine):
    """DatabaseEngine backed by a SQLite file accessed through aiosqlite."""

    db_path: str = None       # directory holding the database file
    db_name: str = None       # database file name
    engine: AsyncEngine = None
    session_maker: Callable[[], async_scoped_session[AsyncSession]] = None
    is_db_done: bool = False  # set once ensure_tables() has run

    def __init__(self, db_path: str, db_name: str):
        self.db_path = db_path
        self.db_name = db_name
        self.db_location = db_path + "/" + db_name

        self.engine = create_async_engine(
            f"sqlite+aiosqlite:///{self.db_location}",
            pool_recycle = 3600,
            echo = False
        )

        # Fresh scoped session per call, keyed to the current asyncio task.
        self.session_maker = lambda: async_scoped_session(
            async_sessionmaker(
                bind = self.engine,
                class_ = AsyncSession
            ),
            scopefunc = asyncio.current_task
        )

    async def ensure_tables(self):
        """Create the database file and its tables on first use; idempotent."""
        if not self.database_exists(self.db_name):
            self.create_database(self.db_name)
            await self.create_tables()

        self.is_db_done = True
        return True

    def database_exists(self, db_name: str) -> bool:
        """Return True when the SQLite file already exists on disk."""
        return os.path.exists(self.db_path + "/" + db_name)

    def create_database(self, db_name: str):
        """Create an empty SQLite database file (and its parent directory)."""
        LocalStorage.ensure_directory_exists(self.db_path)

        with open(self.db_path + "/" + db_name, mode = "w+", encoding = "utf-8") as file:
            file.write("")

    def drop_database(self, db_name: str):
        # NOTE(review): ignores the db_name argument and always removes
        # self.db_location — confirm that is intended.
        os.remove(self.db_location)

    async def table_exists(self, table_name: str) -> bool:
        # NOTE(review): sqlalchemy.inspect on an AsyncEngine is not supported at
        # runtime; this likely needs a connection + run_sync — verify.
        return inspect(self.engine).has_table(table_name)

    async def create_tables(self):
        """Create all tables declared on the ModelBase metadata."""
        async with self.engine.begin() as connection:
            return await connection.run_sync(ModelBase.metadata.create_all)

    async def create(self, data):
        """Persist a new entity inside a rolled-back-on-error session."""
        async with with_rollback(self.session_maker()) as session:
            session.add(data)

    async def query(self, query_term):
        """Execute an arbitrary SQLAlchemy statement and return its result."""
        async with with_rollback(self.session_maker()) as session:
            return await session.execute(query_term)

    async def query_entity(self, entity):
        """Fetch the stored row matching *entity* by its primary-key id."""
        async with with_rollback(self.session_maker()) as session:
            return await session.execute(
                select(type(entity))
                .where(type(entity).id == entity.id)
            )

    async def update(self, data_update_fn):
        """Run *data_update_fn* inside a session that commits on success."""
        async with with_rollback(self.session_maker()):
            data_update_fn()

View file

@ -0,0 +1 @@
from .with_rollback import with_rollback

View file

@ -0,0 +1,18 @@
import logging
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import async_scoped_session
logger = logging.getLogger(__name__)
@asynccontextmanager
async def with_rollback(session: async_scoped_session):
    """Provide a transactional scope around a series of operations.

    Commits and disposes the scoped session on success; rolls back, logs,
    and re-raises on any exception.
    """
    try:
        # async with session.begin():
        yield session

        await session.commit()
        # Dispose of the task-scoped session registry once committed.
        await session.remove()
    except Exception as exception:
        await session.rollback()
        # NOTE(review): the failure path does not call session.remove() —
        # confirm the scoped registry is not leaked on errors.
        logger.error("Session rolled back due to: %s", str(exception))
        raise exception

View file

@ -0,0 +1,2 @@
from .get_vector_database import get_vector_database
from .qdrant import QDrantAdapter, CollectionConfig

View file

@ -59,3 +59,13 @@ class QDrantAdapter(VectorDBInterface):
collection_name = collection_name,
points = data_points
)
    async def find_related_data_points(self, collection_name: str, query_vector):
        """Search *collection_name* for points similar to *query_vector*.

        Returns only points scoring above 0.8, with payloads included.
        """
        client = self.get_qdrant_client()

        return await client.search(
            collection_name = collection_name,
            query_vector = query_vector,
            with_payload = True,
            score_threshold = 0.8
        )

View file

@ -1 +1 @@
from .adapter import QDrantAdapter
from .QDrantAdapter import QDrantAdapter, CollectionConfig

View file

@ -46,6 +46,13 @@ class VectorDBInterface(Protocol):
data_points: List[any]
): raise NotImplementedError
@abstractmethod
async def find_related_data_points(
self,
collection_name: str,
query_vector: any
): raise NotImplementedError
# @abstractmethod
# async def get_data_point(
# self,

View file

@ -0,0 +1,3 @@
from .add_file_to_storage import add_file_to_storage
from .remove_file_from_storage import remove_file_from_storage
from .utils.get_file_metadata import get_file_metadata, FileMetadata

View file

@ -0,0 +1,9 @@
from typing import BinaryIO
from cognitive_architecture.root_dir import get_absolute_path
from .storage.StorageManager import StorageManager
from .storage.LocalStorage import LocalStorage
async def add_file_to_storage(file_path: str, file: BinaryIO):
    """Write *file* into the local "data/files" storage under *file_path*."""
    local_backend = LocalStorage(get_absolute_path("data/files"))
    StorageManager(local_backend).store(file_path, file)

View file

@ -0,0 +1,8 @@
from cognitive_architecture.root_dir import get_absolute_path
from .storage.StorageManager import StorageManager
from .storage.LocalStorage import LocalStorage
async def remove_file_from_storage(file_path: str):
    """Delete *file_path* from the local "data/files" storage."""
    local_backend = LocalStorage(get_absolute_path("data/files"))
    StorageManager(local_backend).remove(file_path)

View file

@ -0,0 +1,37 @@
import os
from typing import BinaryIO
from .StorageManager import Storage
class LocalStorage(Storage):
    """Filesystem-backed Storage rooted at *storage_path*."""

    storage_path: str = None

    def __init__(self, storage_path: str):
        self.storage_path = storage_path

    def store(self, file_path: str, data: BinaryIO):
        """Write all of *data* to storage_path/file_path, creating directories.

        Previously only the storage root was created, so nested paths like
        "docs/report.pdf" failed with FileNotFoundError; now the file's own
        parent directory is ensured.
        """
        full_file_path = self.storage_path + "/" + file_path

        LocalStorage.ensure_directory_exists(os.path.dirname(full_file_path))

        with open(full_file_path, "wb") as f:
            f.write(data.read())

    def retrieve(self, file_path: str):
        """Return the raw bytes stored at storage_path/file_path."""
        full_file_path = self.storage_path + "/" + file_path

        with open(full_file_path, "rb") as f:
            return f.read()

    @staticmethod
    def ensure_directory_exists(file_path: str):
        """Create *file_path* and any missing parents; no-op when it exists."""
        os.makedirs(file_path, exist_ok = True)

    def remove(self, file_path: str):
        """Delete the file stored at storage_path/file_path."""
        os.remove(self.storage_path + "/" + file_path)

View file

@ -0,0 +1,26 @@
from typing import Protocol, BinaryIO


class Storage(Protocol):
    """Structural interface for byte-storage backends."""

    def store(self, file_path: str, data: bytes):
        pass

    def retrieve(self, file_path: str):
        pass

    def remove(self, file_path: str):
        pass


class StorageManager():
    """Thin facade that delegates every call to a concrete Storage backend."""

    storage: Storage = None

    def __init__(self, storage: Storage):
        self.storage = storage

    def store(self, file_path: str, data: BinaryIO):
        """Delegate writing *data* at *file_path* to the backend."""
        return self.storage.store(file_path, data)

    def retrieve(self, file_path: str):
        """Delegate reading *file_path* to the backend."""
        return self.storage.retrieve(file_path)

    def remove(self, file_path: str):
        """Delegate deleting *file_path* to the backend."""
        return self.storage.remove(file_path)

View file

@ -0,0 +1 @@
from .LocalStorage import LocalStorage

View file

@ -0,0 +1,27 @@
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
def extract_keywords(text: str) -> list[str]:
    """Return up to 15 TF-IDF-ranked nouns (longer than 3 chars) from *text*."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = [word for (word, tag) in tagged_tokens if tag == "NN"]

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(nouns)

    # Rank vocabulary entries by their TF-IDF weight, best first.
    ranked_nouns = sorted(
        vectorizer.vocabulary_,
        key = lambda noun: tfidf[0, vectorizer.vocabulary_[noun]],
        reverse = True
    )

    # Keep the best-scoring nouns, skipping short words, capped at 15.
    keywords = [word for word in ranked_nouns if len(word) > 3][:15]

    return keywords

View file

@ -0,0 +1,44 @@
from typing import BinaryIO, TypedDict
import filetype
from unstructured.cleaners.core import clean
from unstructured.partition.pdf import partition_pdf
from .extract_keywords import extract_keywords
class FileTypeException(Exception):
    """Raised when a file's type cannot be determined."""

    message: str

    def __init__(self, message: str):
        self.message = message
class FileMetadata(TypedDict):
    """Metadata extracted from an ingested file (see get_file_metadata)."""
    name: str
    # Added: get_file_metadata populates file_path and add_dlt reads it,
    # but the key was missing from this declaration.
    file_path: str
    mime_type: str
    extension: str
    keywords: list[str]
def get_file_metadata(file: BinaryIO) -> FileMetadata:
    """Extract name, path, mime type, extension and keywords from an open binary file.

    Raises FileTypeException when the file type cannot be guessed.
    """
    # NOTE(review): filetype.guess reads from the stream; presumably it seeks
    # back (or partition_pdf tolerates the offset) — confirm the read position.
    file_type = filetype.guess(file)

    if file_type is None:
        raise FileTypeException("Unknown file detected.")

    keywords: list = []

    # Only PDFs get keyword extraction; other types yield an empty list.
    if file_type.extension == "pdf":
        elements = partition_pdf(file = file, strategy = "fast")
        keywords = extract_keywords(
            "\n".join(map(lambda element: clean(element.text), elements))
        )

    file_path = file.name
    # NOTE(review): splitting on "." keeps only the part before the FIRST dot,
    # so "report.v2.pdf" becomes "report" — confirm this is intended.
    file_name = file_path.split("/")[-1].split(".")[0]

    return FileMetadata(
        name = file_name,
        file_path = file_path,
        mime_type = file_type.mime,
        extension = file_type.extension,
        keywords = keywords
    )

View file

@ -0,0 +1,4 @@
import os


def get_file_size(file_path: str):
    """Return the on-disk size of *file_path* in bytes."""
    file_stats = os.stat(file_path)
    return file_stats.st_size

View file

@ -0,0 +1,27 @@
import enum
from datetime import datetime, timezone

from sqlalchemy.orm import Mapped, MappedColumn
from sqlalchemy import Column, String, DateTime, ForeignKey, Enum, UUID, JSON

from cognitive_architecture.infrastructure.databases.relational import ModelBase


class OperationType(enum.Enum):
    """Kind of data operation being tracked.

    Previously subclassed sqlalchemy's `Enum` *column type* (the only Enum
    in scope), not Python's `enum.Enum`, which breaks `Column(Enum(...))`.
    """
    MERGE_DATA = "MERGE_DATA"
    APPEND_DATA = "APPEND_DATA"


class OperationStatus(enum.Enum):
    """Lifecycle state of an operation row."""
    STARTED = "OPERATION_STARTED"
    IN_PROGRESS = "OPERATION_IN_PROGRESS"
    COMPLETE = "OPERATION_COMPLETE"
    ERROR = "OPERATION_ERROR"
    CANCELLED = "OPERATION_CANCELLED"


class Operation(ModelBase):
    """Relational record tracking the status of a data operation."""

    __tablename__ = "operation"

    id = Column(String, primary_key = True)
    # sqlalchemy.Enum maps the Python enum onto a DB enum/varchar column.
    status = Column(Enum(OperationStatus))
    operation_type = Column(Enum(OperationType))
    data_id = Column(UUID, ForeignKey("data.id"))
    meta_data: Mapped[dict] = MappedColumn(type_ = JSON)  # "metadata" is reserved
    # Timezone-aware timestamp for consistency with the other models
    # (datetime.utcnow produces naive datetimes and is deprecated).
    created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))

View file

@ -0,0 +1,3 @@
from .classify import classify
from .identify import identify
from .save import save

View file

@ -0,0 +1,13 @@
from io import BufferedReader
from typing import Union, BinaryIO
from .exceptions import IngestionException
from .data_types import create_text_data, create_binary_data
def classify(data: Union[str, BinaryIO]):
    """Wrap raw input in the matching IngestionData container.

    Strings become TextData; buffered binary streams become BinaryData.

    Raises IngestionException for unsupported input types.
    """
    from io import BufferedIOBase  # local import: broadens the original check

    if isinstance(data, str):
        return create_text_data(data)

    # Accept any buffered binary stream (BufferedReader, BytesIO, ...), not
    # just BufferedReader: the signature already promises BinaryIO, but
    # in-memory streams previously fell through to the exception.
    if isinstance(data, BufferedIOBase):
        return create_binary_data(data)

    raise IngestionException(f"Data sent to cognee.classify(data: any) not supported: {type(data)}")

View file

@ -0,0 +1,30 @@
from typing import BinaryIO
from cognitive_architecture.infrastructure.files import get_file_metadata, FileMetadata
from .IngestionData import IngestionData
def create_binary_data(data: BinaryIO):
    """Factory: wrap an open binary stream in a BinaryData container."""
    return BinaryData(data)


class BinaryData(IngestionData):
    """IngestionData backed by an open binary stream; metadata is lazy-loaded."""

    data: BinaryIO = None
    metadata: FileMetadata = None  # populated on first use by ensure_metadata()

    def __init__(self, data: BinaryIO):
        self.data = data

    def get_identifier(self):
        """Return a content identifier of the form "<mime>_<kw|kw|...>"."""
        self.ensure_metadata()

        return self.metadata["mime_type"] + "_" + "|".join(self.metadata["keywords"])

    def get_extension(self):
        self.ensure_metadata()

        return self.metadata["extension"]

    def ensure_metadata(self):
        """Extract file metadata once and cache it on the instance."""
        if self.metadata is None:
            self.metadata = get_file_metadata(self.data)

    def get_data(self):
        return self.data

View file

@ -0,0 +1,11 @@
from typing import Protocol, BinaryIO
class IngestionData(Protocol):
    """Structural interface for classified ingestion payloads."""

    data: str | BinaryIO = None
    metadata: dict = None

    def get_data(self):
        pass

    def get_extension(self):
        pass

View file

@ -0,0 +1,16 @@
from .IngestionData import IngestionData
def create_text_data(data: str):
    """Factory: wrap a raw string in a TextData container."""
    return TextData(data)


class TextData(IngestionData):
    """IngestionData backed by an in-memory string."""

    data: str = None

    def __init__(self, data: str):
        self.data = data

    def get_data(self):
        """Return the wrapped string."""
        return self.data

    def get_chunks(self):
        # Not implemented yet.
        pass

View file

@ -0,0 +1,3 @@
from .TextData import TextData, create_text_data
from .BinaryData import BinaryData, create_binary_data
from .IngestionData import IngestionData

View file

@ -0,0 +1,6 @@
class IngestionException(Exception):
    """Raised when ingestion receives a payload it cannot handle."""

    message: str

    def __init__(self, message: str):
        self.message = message

View file

@ -0,0 +1,9 @@
from uuid import UUID, uuid5
from .data_types import IngestionData
# Fixed namespace UUID anchoring all deterministic content identifiers.
null_uuid: UUID = UUID("00000000-0000-0000-0000-000000000000")


def identify(data: IngestionData) -> UUID:
    """Derive a deterministic UUID (uuid5) from the data's content identifier."""
    content_identifier: str = data.get_identifier()
    return uuid5(null_uuid, content_identifier)

View file

@ -0,0 +1,29 @@
import asyncio
from uuid import UUID, uuid4
from cognitive_architecture.infrastructure.files import add_file_to_storage
from cognitive_architecture.infrastructure.data import add_data_to_dataset, Data, Dataset
from .data_types import IngestionData
async def save(dataset_id: UUID, dataset_name: str, data_id: UUID, data: IngestionData):
    """Register *data* under the dataset with a fresh raw-file location.

    Raw-file upload itself is currently disabled (commented out); only the
    relational records are written.
    """
    # Random file name; the extension comes from the classified data type.
    file_path = uuid4().hex + "." + data.get_extension()

    promises = []

    # promises.append(add_file_to_storage(file_path, data.get_data()))

    # NOTE(review): reads data.metadata directly — populated for BinaryData via
    # ensure_metadata, but None for TextData; confirm save() is binary-only.
    promises.append(
        add_data_to_dataset(
            Dataset(
                id = dataset_id,
                # Fall back to the dataset id when no name was provided.
                name = dataset_name if dataset_name else dataset_id.hex
            ),
            Data(
                id = data_id,
                raw_data_location = file_path,
                name = data.metadata["name"],
                meta_data = data.metadata
            )
        )
    )

    await asyncio.gather(*promises)

View file

@ -0,0 +1,6 @@
from os import path

# Absolute directory containing this module; anchor for project-relative paths.
ROOT_DIR = path.dirname(path.abspath(__file__))


def get_absolute_path(path_from_root: str) -> str:
    """Resolve *path_from_root*, taken relative to ROOT_DIR, to an absolute path."""
    joined = path.join(ROOT_DIR, path_from_root)
    return path.abspath(joined)

3783
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -17,7 +17,7 @@ classifiers = [
"Operating System :: Microsoft :: Windows",]
[tool.poetry.dependencies]
python = "^3.10"
python = "3.10.13"
langchain = "^0.0.338"
openai = "1.12.0"
python-dotenv = "1.0.1"
@ -40,6 +40,12 @@ pymupdf = "^1.23.25"
pandas = "^2.2.1"
greenlet = "^3.0.3"
ruff = "^0.2.2"
filetype = "^1.2.0"
unstructured = {extras = ["all-docs"], version = "^0.12.5"}
nltk = "^3.8.1"
scikit-learn = "^1.4.1.post1"
dlt = "^0.4.6"
duckdb = {version = "^0.10.0", extras = ["dlt"]}
[tool.poetry.extras]
dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
@ -73,6 +79,7 @@ pytest = "^7.4.0"
pytest-asyncio = "^0.21.1"
coverage = "^7.3.2"
mypy = "^1.7.1"
notebook = "^7.1.1"
[tool.poetry.group.docs.dependencies]
mkdocs = "^1.4.3"

525
vector_retrieval_demo.ipynb Normal file
View file

@ -0,0 +1,525 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "50d5afda-418f-436b-b467-004863193d4a",
"metadata": {},
"outputs": [],
"source": [
"articles = [\n",
"\"\"\"Edward VII (Albert Edward; 9 November 1841 – 6 May 1910) was King of the United Kingdom and the British Dominions, and Emperor of India, from 22 January 1901 until his death in 1910.\n",
"The second child and eldest son of Queen Victoria and Prince Albert of Saxe-Coburg and Gotha, Edward, nicknamed \"Bertie\", was related to royalty throughout Europe. He was Prince of Wales and heir apparent to the British throne for almost 60 years. During his mother's reign, he was largely excluded from political influence and came to personify the fashionable, leisured elite. He married Princess Alexandra of Denmark in 1863, and the couple had six children. As Prince of Wales, Edward travelled throughout Britain performing ceremonial public duties and represented Britain on visits abroad. His tours of North America in 1860 and of the Indian subcontinent in 1875 proved popular successes, but despite public approval, his reputation as a playboy prince soured his relationship with his mother.\n",
"Edward inherited the throne upon his mother's death in 1901. The King played a role in the modernisation of the British Home Fleet and the reorganisation of the British Army after the Second Boer War of 18991902. He re-instituted traditional ceremonies as public displays and broadened the range of people with whom royalty socialised. He fostered good relations between Britain and other European countries, especially France, for which he was popularly called \"Peacemaker\", but his relationship with his nephew, German Emperor Wilhelm II, was poor. The Edwardian era, which covered Edward's reign and was named after him, coincided with the start of a new century and heralded significant changes in technology and society, including steam turbine propulsion and the rise of socialism. He died in 1910 in the midst of a constitutional crisis that was resolved the following year by the Parliament Act 1911, which restricted the power of the unelected House of Lords. Edward was succeeded by his only surviving son, George V.\"\"\",\n",
" \"\"\"George V (George Frederick Ernest Albert; 3 June 1865 – 20 January 1936) was King of the United Kingdom and the British Dominions, and Emperor of India, from 6 May 1910 until his death in 1936.\n",
"George was born during the reign of his paternal grandmother, Queen Victoria, as the second son of the Prince and Princess of Wales (later King Edward VII and Queen Alexandra). He was third in the line of succession to the British throne behind his father and his elder brother, Prince Albert Victor. From 1877 to 1892, George served in the Royal Navy, until his elder brother's unexpected death in January 1892 put him directly in line for the throne. The next year, George married his brother's fiancée, Princess Victoria Mary of Teck, and they had six children. When Queen Victoria died in 1901, George's father ascended the throne as Edward VII, and George was created Prince of Wales. He became king-emperor on his father's death in 1910.\n",
"George's reign saw the rise of socialism, communism, fascism, Irish republicanism, and the Indian independence movement, all of which radically changed the political landscape of the British Empire, which itself reached its territorial peak by the beginning of the 1920s. The Parliament Act 1911 established the supremacy of the elected British House of Commons over the unelected House of Lords. As a result of the First World War (19141918), the empires of his first cousins Nicholas II of Russia and Wilhelm II of Germany fell, while the British Empire expanded to its greatest effective extent. In 1917, George became the first monarch of the House of Windsor, which he renamed from the House of Saxe-Coburg and Gotha as a result of anti-German public sentiment. He appointed the first Labour ministry in 1924, and the 1931 Statute of Westminster recognised the Empire's Dominions as separate, independent states within the British Commonwealth of Nations.\n",
"George suffered from smoking-related health problems during his later reign. On his death in January 1936, he was succeeded by his eldest son, Edward VIII. Edward abdicated in December of that year and was succeeded by his younger brother Albert, who took the regnal name George VI.\"\"\",\n",
"\"\"\"Edward VIII (Edward Albert Christian George Andrew Patrick David; 23 June 1894 28 May 1972), later known as the Duke of Windsor, was King of the United Kingdom and the Dominions of the British Empire, and Emperor of India, from 20 January 1936 until his abdication in December of the same year.[a]\n",
"Edward was born during the reign of his great-grandmother Queen Victoria as the eldest child of the Duke and Duchess of York, later King George V and Queen Mary. He was created Prince of Wales on his 16th birthday, seven weeks after his father succeeded as king. As a young man, Edward served in the British Army during the First World War and undertook several overseas tours on behalf of his father. The Prince of Wales gained popularity due to his charm and charisma, and his fashion sense became a hallmark of the era. After the war, his conduct began to give cause for concern; he engaged in a series of sexual affairs that worried both his father and the British prime minister, Stanley Baldwin.\n",
"Upon his father's death in 1936, Edward became the second monarch of the House of Windsor. The new king showed impatience with court protocol, and caused consternation among politicians by his apparent disregard for established constitutional conventions. Only months into his reign, a constitutional crisis was caused by his proposal to marry Wallis Simpson, an American who had divorced her first husband and was seeking a divorce from her second. The prime ministers of the United Kingdom and the Dominions opposed the marriage, arguing a divorced woman with two living ex-husbands was politically and socially unacceptable as a prospective queen consort. Additionally, such a marriage would have conflicted with Edward's status as titular head of the Church of England, which, at the time, disapproved of remarriage after divorce if a former spouse was still alive. Edward knew the Baldwin government would resign if the marriage went ahead, which could have forced a general election and would have ruined his status as a politically neutral constitutional monarch. When it became apparent he could not marry Simpson and remain on the throne, he abdicated. He was succeeded by his younger brother, George VI. With a reign of 326 days, Edward was one of the shortest-reigning British monarchs to date.\n",
"After his abdication, Edward was created Duke of Windsor. He married Simpson in France on 3 June 1937, after her second divorce became final. Later that year, the couple toured Nazi Germany, which fed rumours that he was a Nazi sympathiser. During the Second World War, Edward was at first stationed with the British Military Mission to France but after the fall of France was appointed Governor of the Bahamas. After the war, Edward spent the rest of his life in France. He and Wallis remained married until his death in 1972; they had no children.\"\"\",\n",
"\"\"\"George VI (Albert Frederick Arthur George; 14 December 1895 6 February 1952) was King of the United Kingdom and the Dominions of the British Commonwealth from 11 December 1936 until his death on 6 February 1952. He was also the last Emperor of India from 1936 until the British Raj was dissolved in August 1947, and the first head of the Commonwealth following the London Declaration of 1949.\n",
"The future George VI was born during the reign of his great-grandmother Queen Victoria; he was named Albert at birth after his great-grandfather Prince Albert of Saxe-Coburg and Gotha and was known as \"Bertie\" to his family and close friends. His father ascended the throne as George V in 1910. As the second son of the king, Albert was not expected to inherit the throne. He spent his early life in the shadow of his elder brother, Edward, the heir apparent. Albert attended naval college as a teenager and served in the Royal Navy and Royal Air Force during the First World War. In 1920, he was made Duke of York. He married Lady Elizabeth Bowes-Lyon in 1923, and they had two daughters, Elizabeth and Margaret. In the mid-1920s, he engaged speech therapist Lionel Logue to treat his stutter, which he learned to manage to some degree. His elder brother ascended the throne as Edward VIII after their father died in 1936, but Edward abdicated later that year to marry the twice-divorced American socialite Wallis Simpson. As heir presumptive to Edward VIII, Albert became king, taking the regnal name George VI.\n",
"In September 1939, the British Empire and most Commonwealth countries—but not Ireland—declared war on Nazi Germany, following the invasion of Poland. War with the Kingdom of Italy and the Empire of Japan followed in 1940 and 1941, respectively. George VI was seen as sharing the hardships of the common people and his popularity soared. Buckingham Palace was bombed during the Blitz while the King and Queen were there, and his younger brother the Duke of Kent was killed on active service. George became known as a symbol of British determination to win the war. Britain and its allies were victorious in 1945, but the British Empire declined. Ireland had largely broken away, followed by the independence of India and Pakistan in 1947. George relinquished the title of Emperor of India in June 1948 and instead adopted the new title of Head of the Commonwealth. He was beset by smoking-related health problems in the later years of his reign and died at Sandringham House, aged 56, of a coronary thrombosis in 1952. He was succeeded by his elder daughter, Elizabeth II.\"\"\",\n",
"\"\"\"Elizabeth II (Elizabeth Alexandra Mary; 21 April 1926 8 September 2022) was Queen of the United Kingdom and other Commonwealth realms from 6 February 1952 until her death in 2022. She was queen regnant of 32 sovereign states over the course of her lifetime and remained the monarch of 15 realms by the time of her death. Her reign of over 70 years is the longest of any British monarch, the longest of any female monarch, and the second longest verified reign of any monarch of a sovereign state in history.\n",
"Elizabeth was born in Mayfair, London, during the reign of her paternal grandfather, King George V. She was the first child of the Duke and Duchess of York (later King George VI and Queen Elizabeth The Queen Mother). Her father acceded to the throne in 1936 upon the abdication of his brother Edward VIII, making the ten-year-old Princess Elizabeth the heir presumptive. She was educated privately at home and began to undertake public duties during the Second World War, serving in the Auxiliary Territorial Service. In November 1947, she married Philip Mountbatten, a former prince of Greece and Denmark, and their marriage lasted 73 years until his death in 2021. They had four children: Charles, Anne, Andrew, and Edward.\n",
"When her father died in February 1952, Elizabeth—then 25 years old—became queen of seven independent Commonwealth countries: the United Kingdom, Canada, Australia, New Zealand, South Africa, Pakistan, and Ceylon (known today as Sri Lanka), as well as head of the Commonwealth. Elizabeth reigned as a constitutional monarch through major political changes such as the Troubles in Northern Ireland, devolution in the United Kingdom, the decolonisation of Africa, and the United Kingdom's accession to the European Communities, as well as its subsequent withdrawal. The number of her realms varied over time as territories gained independence and some realms became republics. As queen, Elizabeth was served by more than 170 prime ministers across her realms. Her many historic visits and meetings included state visits to China in 1986, to Russia in 1994, and to the Republic of Ireland in 2011, and meetings with five popes and fourteen US presidents.\n",
"Significant events included Elizabeth's coronation in 1953 and the celebrations of her Silver, Golden, Diamond, and Platinum jubilees in 1977, 2002, 2012, and 2022, respectively. Although she faced occasional republican sentiment and media criticism of her family—particularly after the breakdowns of her children's marriages, her annus horribilis in 1992, and the death in 1997 of her former daughter-in-law Diana—support for the monarchy in the United Kingdom remained consistently high throughout her lifetime, as did her personal popularity. Elizabeth died aged 96 at Balmoral Castle in September 2022, and was succeeded by her eldest son, Charles III.\"\"\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e328a903-d084-4d07-9b95-0a9196d7f719",
"metadata": {},
"outputs": [],
"source": [
"categories = {\n",
" \"Natural Language Text\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Articles, essays, and reports\",\n",
" \"Books and manuscripts\",\n",
" \"News stories and blog posts\",\n",
" \"Research papers and academic publications\",\n",
" \"Social media posts and comments\",\n",
" \"Website content and product descriptions\",\n",
" \"Personal narratives and stories\"\n",
" ]\n",
" },\n",
" \"Structured Documents\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Spreadsheets and tables\",\n",
" \"Forms and surveys\",\n",
" \"Databases and CSV files\"\n",
" ]\n",
" },\n",
" \"Code and Scripts\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Source code in various programming languages\",\n",
" \"Shell commands and scripts\",\n",
" \"Markup languages (HTML, XML)\",\n",
" \"Stylesheets (CSS) and configuration files (YAML, JSON, INI)\"\n",
" ]\n",
" },\n",
" \"Conversational Data\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Chat transcripts and messaging history\",\n",
" \"Customer service logs and interactions\",\n",
" \"Conversational AI training data\"\n",
" ]\n",
" },\n",
" \"Educational Content\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Textbook content and lecture notes\",\n",
" \"Exam questions and academic exercises\",\n",
" \"E-learning course materials\"\n",
" ]\n",
" },\n",
" \"Creative Writing\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Poetry and prose\",\n",
" \"Scripts for plays, movies, and television\",\n",
" \"Song lyrics\"\n",
" ]\n",
" },\n",
" \"Technical Documentation\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Manuals and user guides\",\n",
" \"Technical specifications and API documentation\",\n",
" \"Helpdesk articles and FAQs\"\n",
" ]\n",
" },\n",
" \"Legal and Regulatory Documents\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Contracts and agreements\",\n",
" \"Laws, regulations, and legal case documents\",\n",
" \"Policy documents and compliance materials\"\n",
" ]\n",
" },\n",
" \"Medical and Scientific Texts\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Clinical trial reports\",\n",
" \"Patient records and case notes\",\n",
" \"Scientific journal articles\"\n",
" ]\n",
" },\n",
" \"Financial and Business Documents\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Financial reports and statements\",\n",
" \"Business plans and proposals\",\n",
" \"Market research and analysis reports\"\n",
" ]\n",
" },\n",
" \"Advertising and Marketing Materials\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Ad copies and marketing slogans\",\n",
" \"Product catalogs and brochures\",\n",
" \"Press releases and promotional content\"\n",
" ]\n",
" },\n",
" \"Emails and Correspondence\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Professional and formal correspondence\",\n",
" \"Personal emails and letters\"\n",
" ]\n",
" },\n",
" \"Metadata and Annotations\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Image and video captions\",\n",
" \"Annotations and metadata for various media\"\n",
" ]\n",
" },\n",
" \"Language Learning Materials\": {\n",
" \"type\": \"TEXT\",\n",
" \"subclass\": [\n",
" \"Vocabulary lists and grammar rules\",\n",
" \"Language exercises and quizzes\"\n",
" ]\n",
" },\n",
" \"Audio Content\": {\n",
" \"type\": \"AUDIO\",\n",
" \"subclass\": [\n",
" \"Music tracks and albums\",\n",
" \"Podcasts and radio broadcasts\",\n",
" \"Audiobooks and audio guides\",\n",
" \"Recorded interviews and speeches\",\n",
" \"Sound effects and ambient sounds\"\n",
" ]\n",
" },\n",
" \"Image Content\": {\n",
" \"type\": \"IMAGE\",\n",
" \"subclass\": [\n",
" \"Photographs and digital images\",\n",
" \"Illustrations, diagrams, and charts\",\n",
" \"Infographics and visual data representations\",\n",
" \"Artwork and paintings\",\n",
" \"Screenshots and graphical user interfaces\"\n",
" ]\n",
" },\n",
" \"Video Content\": {\n",
" \"type\": \"VIDEO\",\n",
" \"subclass\": [\n",
" \"Movies and short films\",\n",
" \"Documentaries and educational videos\",\n",
" \"Video tutorials and how-to guides\",\n",
" \"Animated features and cartoons\",\n",
" \"Live event recordings and sports broadcasts\"\n",
" ]\n",
" },\n",
" \"Multimedia Content\": {\n",
" \"type\": \"MULTIMEDIA\",\n",
" \"subclass\": [\n",
" \"Interactive web content and games\",\n",
" \"Virtual reality (VR) and augmented reality (AR) experiences\",\n",
" \"Mixed media presentations and slide decks\",\n",
" \"E-learning modules with integrated multimedia\",\n",
" \"Digital exhibitions and virtual tours\"\n",
" ]\n",
" },\n",
" \"3D Models and CAD Content\": {\n",
" \"type\": \"3D_MODEL\",\n",
" \"subclass\": [\n",
" \"Architectural renderings and building plans\",\n",
" \"Product design models and prototypes\",\n",
" \"3D animations and character models\",\n",
" \"Scientific simulations and visualizations\",\n",
" \"Virtual objects for AR/VR environments\"\n",
" ]\n",
" },\n",
" \"Procedural Content\": {\n",
" \"type\": \"PROCEDURAL\",\n",
" \"subclass\": [\n",
" \"Tutorials and step-by-step guides\",\n",
" \"Workflow and process descriptions\",\n",
" \"Simulation and training exercises\",\n",
" \"Recipes and crafting instructions\"\n",
" ]\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "89a3f3a0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/borisarzentar/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"\u001b[32m2024-03-02 11:50:12.400\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mfastembed.embedding\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[33m\u001b[1mDefaultEmbedding, FlagEmbedding, JinaEmbedding are deprecated.Use from fastembed import TextEmbedding instead.\u001b[0m\n"
]
}
],
"source": [
"import uuid\n",
"from os import path\n",
"from qdrant_client.models import PointStruct, Distance\n",
"from cognitive_architecture.root_dir import ROOT_DIR\n",
"from fastembed import TextEmbedding\n",
"# from cognitive_architecture.openai_tools import get_embedding_with_backoff\n",
"from cognitive_architecture.infrastructure.databases.vector import QDrantAdapter, CollectionConfig"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "659e327e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching 9 files: 0%| | 0/9 [00:00<?, ?it/s]\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"model_quantized.onnx: 100%|██████████| 279M/279M [08:28<00:00, 548kB/s]\n",
"\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"model.onnx: 100%|██████████| 1.11G/1.11G [24:44<00:00, 748kB/s]\n",
"Fetching 9 files: 100%|██████████| 9/9 [24:46<00:00, 165.16s/it]\n"
]
},
{
"ename": "ValidationError",
"evalue": "2 validation errors for PointStruct\nvector.list[float]\n Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/list_type\nvector.dict[str,union[SparseVector,list[float]]]\n Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/dict_type",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 39\u001b[0m\n\u001b[1;32m 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_data_points(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, data_points)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n\u001b[0;32m---> 39\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m run()\n",
"Cell \u001b[0;32mIn[5], line 36\u001b[0m, in \u001b[0;36mrun\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_collection(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, CollectionConfig(\n\u001b[1;32m 29\u001b[0m vector_config \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 30\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msize\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m1536\u001b[39m,\n\u001b[1;32m 31\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdistance\u001b[39m\u001b[38;5;124m\"\u001b[39m: Distance\u001b[38;5;241m.\u001b[39mDOT\n\u001b[1;32m 32\u001b[0m }\n\u001b[1;32m 33\u001b[0m ))\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n\u001b[0;32m---> 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_data_points(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, data_points)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/cognitive_architecture/infrastructure/databases/vector/qdrant/QDrantAdapter.py:58\u001b[0m, in \u001b[0;36mQDrantAdapter.create_data_points\u001b[0;34m(self, collection_name, data_points)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_data_points\u001b[39m(\u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, data_points: List[\u001b[38;5;28many\u001b[39m]):\n\u001b[1;32m 56\u001b[0m client \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_qdrant_client()\n\u001b[0;32m---> 58\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m client\u001b[38;5;241m.\u001b[39mupload_points(\n\u001b[1;32m 59\u001b[0m collection_name \u001b[38;5;241m=\u001b[39m collection_name,\n\u001b[1;32m 60\u001b[0m points \u001b[38;5;241m=\u001b[39m data_points\n\u001b[1;32m 61\u001b[0m )\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/async_qdrant_client.py:1800\u001b[0m, in \u001b[0;36mAsyncQdrantClient.upload_points\u001b[0;34m(self, collection_name, points, batch_size, parallel, method, max_retries, wait, shard_key_selector, **kwargs)\u001b[0m\n\u001b[1;32m 1776\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Upload points to the collection\u001b[39;00m\n\u001b[1;32m 1777\u001b[0m \n\u001b[1;32m 1778\u001b[0m \u001b[38;5;124;03mSimilar to `upload_collection` method, but operates with points, rather than vector and payload individually.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1797\u001b[0m \n\u001b[1;32m 1798\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1799\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(kwargs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnknown arguments: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1800\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39mupload_points(\n\u001b[1;32m 1801\u001b[0m collection_name\u001b[38;5;241m=\u001b[39mcollection_name,\n\u001b[1;32m 1802\u001b[0m points\u001b[38;5;241m=\u001b[39mpoints,\n\u001b[1;32m 1803\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m 1804\u001b[0m parallel\u001b[38;5;241m=\u001b[39mparallel,\n\u001b[1;32m 1805\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 1806\u001b[0m max_retries\u001b[38;5;241m=\u001b[39mmax_retries,\n\u001b[1;32m 1807\u001b[0m wait\u001b[38;5;241m=\u001b[39mwait,\n\u001b[1;32m 1808\u001b[0m 
shard_key_selector\u001b[38;5;241m=\u001b[39mshard_key_selector,\n\u001b[1;32m 1809\u001b[0m )\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/local/async_qdrant_local.py:661\u001b[0m, in \u001b[0;36mAsyncQdrantLocal.upload_points\u001b[0;34m(self, collection_name, points, **kwargs)\u001b[0m\n\u001b[1;32m 658\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mupload_points\u001b[39m(\n\u001b[1;32m 659\u001b[0m \u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, points: Iterable[types\u001b[38;5;241m.\u001b[39mPointStruct], \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any\n\u001b[1;32m 660\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 661\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_upload_points\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/local/async_qdrant_local.py:673\u001b[0m, in \u001b[0;36mAsyncQdrantLocal._upload_points\u001b[0;34m(self, collection_name, points)\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_upload_points\u001b[39m(\n\u001b[1;32m 669\u001b[0m \u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, points: Iterable[Union[types\u001b[38;5;241m.\u001b[39mPointStruct, types\u001b[38;5;241m.\u001b[39mRecord]]\n\u001b[1;32m 670\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 671\u001b[0m collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_collection(collection_name)\n\u001b[1;32m 672\u001b[0m collection\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[0;32m--> 673\u001b[0m \u001b[43m[\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mrest_models\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPointStruct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvector\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpayload\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpayload\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 677\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpoint\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\n\u001b[1;32m 678\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 679\u001b[0m )\n",
"Cell \u001b[0;32mIn[5], line 17\u001b[0m, in \u001b[0;36mcreate_data_point\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_data_point\u001b[39m(data: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m PointStruct:\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPointStruct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43muuid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muuid4\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mvector\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43membed_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mpayload\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mraw\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mValidationError\u001b[0m: 2 validation errors for PointStruct\nvector.list[float]\n Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/list_type\nvector.dict[str,union[SparseVector,list[float]]]\n Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.6/v/dict_type"
]
}
],
"source": [
"database_path = path.join(path.abspath(ROOT_DIR), \"database/data\", \"vector_retrieval_demo.db\")\n",
"\n",
"try:\n",
" import shutil\n",
" shutil.rmtree(database_path)\n",
"except Exception as exception:\n",
" print(exception)\n",
" pass\n",
"\n",
"def embed_data(data: str):\n",
" embedding_engine = TextEmbedding(model_name = \"sentence-transformers/paraphrase-multilingual-mpnet-base-v2\")\n",
" embedding_engine.embed(documents = [data])\n",
"\n",
"qdrant_client = QDrantAdapter(qdrant_path = database_path, qdrant_url = None, qdrant_api_key = None)\n",
"\n",
"def create_data_point(data: str) -> PointStruct:\n",
" return PointStruct(\n",
" id = str(uuid.uuid4()),\n",
" vector = embed_data(data),\n",
" payload = {\n",
" \"raw\": data,\n",
" }\n",
" )\n",
"\n",
"data_points = map(create_data_point, articles)\n",
"\n",
"async def run():\n",
" result = await qdrant_client.create_collection(\"test_collection\", CollectionConfig(\n",
" vector_config = {\n",
" \"size\": 1536,\n",
" \"distance\": Distance.DOT\n",
" }\n",
" ))\n",
" print(result)\n",
"\n",
" result = await qdrant_client.create_data_points(\"test_collection\", data_points)\n",
" print(result)\n",
"\n",
"await run()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b6163a2",
"metadata": {},
"outputs": [],
"source": [
"qdrant_client = QDrantAdapter(qdrant_path = database_path, qdrant_url = None, qdrant_api_key = None)\n",
"\n",
"query_vector = embed_data(\"Last emperor of India\")\n",
"\n",
"results = await qdrant_client.find_related_data_points(collection_name = \"test_collection\", query_vector = query_vector)\n",
"\n",
"import json\n",
"\n",
"for result in results:\n",
" print(result.score)\n",
"\n",
"results[0].payload[\"raw\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d18b00f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}