Ollama fixes, missing libs + config fixes

2024-03-26 17:38:16 +01:00 · 2024-03-26 17:38:16 +01:00 · 4e1b2db8ae
commit 4e1b2db8ae
parent 4842443206 a87d627ec2
18 changed files with 407 additions and 3176 deletions
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@ -3,7 +3,6 @@ from os import path, listdir
 import asyncio
 import dlt
 import duckdb
-from unstructured.cleaners.core import clean
 from cognee.root_dir import get_absolute_path
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.files import get_file_metadata
@ -19,11 +18,11 @@ async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
            for file_or_dir in listdir(root_dir_path):
                if path.isdir(path.join(root_dir_path, file_or_dir)):
                    dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
-                    dataset_name = clean(dataset_name.replace(" ", "_"))
+                    dataset_name = dataset_name.strip().replace(" ", "_")

                    nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

-                    for dataset in nested_datasets:
+                    for dataset in nested_datasets.keys():
                        datasets[dataset] = nested_datasets[dataset]
                else:
                    if parent_dir not in datasets:
@ -37,7 +36,7 @@ async def add(file_paths: Union[str, List[str]], dataset_name: str = None):

        results = []

-        for key in datasets:
+        for key in datasets.keys():
            if dataset_name is not None and not key.startswith(dataset_name):
                continue

--- a/cognee/api/v1/cognify/cognify.py
+++ b/cognee/api/v1/cognify/cognify.py
@ -3,8 +3,7 @@ import asyncio
 from typing import List, Union
 import instructor
 from openai import OpenAI
-from unstructured.cleaners.core import clean
-from unstructured.partition.pdf import partition_pdf
+from pypdf import PdfReader
 from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
 from cognee.modules.cognify.llm.label_content import label_content
 from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
@ -27,7 +26,6 @@ from cognee.shared.data_models import GraphDBType
 from cognee.infrastructure.databases.relational import DuckDBAdapter
 from cognee.modules.cognify.graph.add_document_node import add_document_node
 from cognee.modules.cognify.graph.initialize_graph import initialize_graph
-from cognee.infrastructure.databases.vector  import CollectionConfig, VectorConfig
 from cognee.infrastructure import infrastructure_config

 config = Config()
@ -62,8 +60,9 @@ async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object

    for file_metadata in files_metadata:
        with open(file_metadata["file_path"], "rb") as file:
-            elements = partition_pdf(file = file, strategy = "fast")
-            text = "\n".join(map(lambda element: clean(element.text), elements))
+            reader = PdfReader(stream = file)
+            pages = list(reader.pages[:3])
+            text = "\n".join([page.extract_text().strip() for page in pages])

            awaitables.append(process_text(text, file_metadata))

@ -159,18 +158,11 @@ async def process_text(input_text: str, file_metadata: dict):

    unique_layers = nodes_by_layer.keys()

-    collection_config = CollectionConfig(
-        vector_config = VectorConfig(
-            distance = "Cosine",
-            size = 3072
-        )
-    )
-
    try:
        db_engine = infrastructure_config.get_config()["vector_engine"]

        for layer in unique_layers:
-            await db_engine.create_collection(layer, collection_config)
+            await db_engine.create_collection(layer)
    except Exception as e:
        print(e)

@ -178,7 +170,6 @@ async def process_text(input_text: str, file_metadata: dict):

    results = await resolve_cross_graph_references(nodes_by_layer)

-
    relationships = graph_ready_output(results)
    # print(relationships)
    await graph_client.load_graph_from_file()
--- a/cognee/infrastructure/InfrastructureConfig.py
+++ b/cognee/infrastructure/InfrastructureConfig.py
@ -3,6 +3,7 @@ from .databases.relational import SqliteEngine, DatabaseEngine
 from .databases.vector import WeaviateAdapter, VectorDBInterface
 from .llm.llm_interface import LLMInterface
 from .llm.openai.adapter import OpenAIAdapter
+from .databases.vector import WeaviateAdapter, VectorDBInterface, DefaultEmbeddingEngine

 config = Config()
 config.load()
--- a/cognee/infrastructure/databases/vector/init.py
+++ b/cognee/infrastructure/databases/vector/init.py
@ -5,3 +5,4 @@ from .models.VectorConfig import VectorConfig
 from .models.CollectionConfig import CollectionConfig
 from .weaviate_db import WeaviateAdapter
 from .vector_db_interface import VectorDBInterface
+from .embeddings.DefaultEmbeddingEngine import DefaultEmbeddingEngine
--- a/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py
@ -0,0 +1,13 @@
+from typing import List
+from fastembed import TextEmbedding
+from .EmbeddingEngine import EmbeddingEngine
+
+class DefaultEmbeddingEngine(EmbeddingEngine):
+    async def embed_text(self, text: List[str]) -> List[float]:
+        embedding_model = TextEmbedding(model_name = "BAAI/bge-large-en-v1.5")
+        embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
+
+        return embeddings_list
+
+    def get_vector_size(self) -> int:
+        return 1024
--- a/cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py
@ -0,0 +1,8 @@
+from typing import List, Protocol
+
+class EmbeddingEngine(Protocol):
+    async def embed_text(self, text: str) -> List[float]:
+        raise NotImplementedError()
+
+    def get_vector_size(self) -> int:
+        raise NotImplementedError()
--- a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py
+++ b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py
@ -1,12 +1,10 @@
 import asyncio
-from typing import List, Dict
-# from pydantic import BaseModel, Field
+from typing import List, Dict, Optional
 from qdrant_client import AsyncQdrantClient, models
 from ..vector_db_interface import VectorDBInterface
 from ..models.DataPoint import DataPoint
-from ..models.VectorConfig import VectorConfig
 from ..models.CollectionConfig import CollectionConfig
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from ..embeddings.EmbeddingEngine import EmbeddingEngine

 # class CollectionConfig(BaseModel, extra = "forbid"):
 #     vector_config: Dict[str, models.VectorParams] = Field(..., description="Vectors configuration" )
@ -14,26 +12,6 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client
 #     optimizers_config: Optional[models.OptimizersConfig] = Field(default = None, description="Optimizers configuration")
 #     quantization_config: Optional[models.QuantizationConfig] = Field(default = None, description="Quantization configuration")

-async def embed_data(data: str):
-    llm_client = get_llm_client()
-
-    return await llm_client.async_get_embedding_with_backoff(data)
-
-async def convert_to_qdrant_point(data_point: DataPoint):
-    return models.PointStruct(
-        id = data_point.id,
-        payload = data_point.payload,
-        vector = {
-            "text": await embed_data(data_point.get_embeddable_data())
-        }
-    )
-
-def create_vector_config(vector_config: VectorConfig):
-    return models.VectorParams(
-        size = vector_config.size,
-        distance = vector_config.distance
-    )
-
 def create_hnsw_config(hnsw_config: Dict):
    if hnsw_config is not None:
        return models.HnswConfig()
@ -54,7 +32,9 @@ class QDrantAdapter(VectorDBInterface):
    qdrant_path: str = None
    qdrant_api_key: str = None

-    def __init__(self, qdrant_path, qdrant_url, qdrant_api_key):
+    def __init__(self, qdrant_path, qdrant_url, qdrant_api_key, embedding_engine: EmbeddingEngine):
+        self.embedding_engine = embedding_engine
+
        if qdrant_path is not None:
            self.qdrant_path = qdrant_path
        else:
@ -77,17 +57,22 @@ class QDrantAdapter(VectorDBInterface):
            location = ":memory:"
        )

+    async def embed_data(self, data: List[str]) -> List[float]:
+        return await self.embedding_engine.embed_text(data)
+
    async def create_collection(
      self,
      collection_name: str,
-      collection_config: CollectionConfig,
    ):
        client = self.get_qdrant_client()

        return await client.create_collection(
            collection_name = collection_name,
            vectors_config = {
-                "text": create_vector_config(collection_config.vector_config)
+                "text": models.VectorParams(
+                    size = self.embedding_engine.get_vector_size(),
+                    distance = "Cosine"
+                )
            }
        )

@ -96,6 +81,17 @@ class QDrantAdapter(VectorDBInterface):

        awaitables = []

+        data_vectors = await self.embed_data(list(map(lambda data_point: data_point.get_embeddable_data(), data_points)))
+
+        async def convert_to_qdrant_point(data_point: DataPoint):
+            return models.PointStruct(
+                id = data_point.id,
+                payload = data_point.payload,
+                vector = {
+                    "text": data_vectors[data_points.index(data_point)]
+                }
+            )
+
        for point in data_points:
            awaitables.append(convert_to_qdrant_point(point))

@ -106,21 +102,31 @@ class QDrantAdapter(VectorDBInterface):
            points = points
        )

-    async def search(self, collection_name: str, query_text: str, limit: int, with_vector: bool = False):
+    async def search(
+        self,
+        collection_name: str,
+        query_text: Optional[str] = None,
+        query_vector: Optional[List[float]] = None,
+        limit: int = None,
+        with_vector: bool = False
+    ):
+        if query_text is None and query_vector is None:
+            raise ValueError("One of query_text or query_vector must be provided!")
+
        client = self.get_qdrant_client()

        return await client.search(
            collection_name = collection_name,
            query_vector = models.NamedVector(
                name = "text",
-                vector = await embed_data(query_text)
+                vector = query_vector if query_vector is not None else (await self.embed_data([query_text]))[0],
            ),
            limit = limit,
            with_vectors = with_vector
        )


-    async def batch_search(self, collection_name: str, query_texts: List[str], limit: int, with_vectors: bool = False):
+    async def batch_search(self, collection_name: str, query_texts: List[str], limit: int = None, with_vectors: bool = False):
        """
        Perform batch search in a Qdrant collection with dynamic search requests.

@ -134,9 +140,7 @@ class QDrantAdapter(VectorDBInterface):
        - results: The search results from Qdrant.
        """

-        client = self.get_qdrant_client()
-
-        vectors = await asyncio.gather(*[embed_data(query_text) for query_text in query_texts])
+        vectors = await self.embed_data(query_texts)

        # Generate dynamic search requests based on the provided embeddings
        requests = [
@ -150,6 +154,8 @@ class QDrantAdapter(VectorDBInterface):
            ) for vector in vectors
        ]

+        client = self.get_qdrant_client()
+
        # Perform batch search with the dynamically generated requests
        results = await client.search_batch(
            collection_name = collection_name,
--- a/cognee/infrastructure/databases/vector/vector_db_interface.py
+++ b/cognee/infrastructure/databases/vector/vector_db_interface.py
@ -1,6 +1,5 @@
-from typing import List, Protocol
+from typing import List, Protocol, Optional
 from abc import abstractmethod
-from .models.CollectionConfig import CollectionConfig
 from .models.DataPoint import DataPoint

 class VectorDBInterface(Protocol):
@ -8,8 +7,7 @@ class VectorDBInterface(Protocol):
    @abstractmethod
    async def create_collection(
        self,
-        collection_name: str,
-        collection_config: CollectionConfig
+        collection_name: str
    ): raise NotImplementedError

    # @abstractmethod
@ -74,7 +72,8 @@ class VectorDBInterface(Protocol):
    async def search(
        self,
        collection_name: str,
-        query_text: str,
+        query_text: Optional[str],
+        query_vector: Optional[List[float]],
        limit: int,
        with_vector: bool = False

--- a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py
+++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py
@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 from multiprocessing import Pool
 import weaviate
 import weaviate.classes as wvc
@ -7,29 +7,35 @@ from weaviate.classes.data import DataObject
 from ..vector_db_interface import VectorDBInterface
 from ..models.DataPoint import DataPoint
 from ..models.ScoredResult import ScoredResult
+from ..embeddings.EmbeddingEngine import EmbeddingEngine

 class WeaviateAdapter(VectorDBInterface):
    async_pool: Pool = None
+    embedding_engine: EmbeddingEngine = None
+
+    def __init__(self, url: str, api_key: str, embedding_engine: EmbeddingEngine):
+        self.embedding_engine = embedding_engine

-    def __init__(self, url: str, api_key: str, openai_api_key: str):
        self.client = weaviate.connect_to_wcs(
            cluster_url = url,
            auth_credentials = weaviate.auth.AuthApiKey(api_key),
-            headers = {
-                "X-OpenAI-Api-Key": openai_api_key
-            },
-            additional_config = wvc.init.AdditionalConfig(timeou = wvc.init.Timeout(init = 30))
+            # headers = {
+            #     "X-OpenAI-Api-Key": openai_api_key
+            # },
+            additional_config = wvc.init.AdditionalConfig(timeout = wvc.init.Timeout(init = 30))
        )

-    async def create_collection(self, collection_name: str, collection_config: dict):
+    async def embed_data(self, data: List[str]) -> List[float]:
+        return await self.embedding_engine.embed_text(data)
+
+    async def create_collection(self, collection_name: str):
        return self.client.collections.create(
            name = collection_name,
-            vectorizer_config = wvcc.Configure.Vectorizer.text2vec_openai(),
-            generative_config = wvcc.Configure.Generative.openai(),
            properties = [
                wvcc.Property(
                    name = "text",
-                    data_type = wvcc.DataType.TEXT
+                    data_type = wvcc.DataType.TEXT,
+                    skip_vectorization = True
                )
            ]
        )
@ -38,26 +44,38 @@ class WeaviateAdapter(VectorDBInterface):
        return self.client.collections.get(collection_name)

    async def create_data_points(self, collection_name: str, data_points: List[DataPoint]):
+        data_vectors = await self.embed_data(list(map(lambda data_point: data_point.get_embeddable_data(), data_points)))
+
        def convert_to_weaviate_data_points(data_point: DataPoint):
            return DataObject(
                uuid = data_point.id,
-                properties = data_point.payload
+                properties = data_point.payload,
+                vector = data_vectors[data_points.index(data_point)]
            )

        objects = list(map(convert_to_weaviate_data_points, data_points))

        return self.get_collection(collection_name).data.insert_many(objects)

-    async def search(self, collection_name: str, query_text: str, limit: int, with_vector: bool = False):
+    async def search(
+        self,
+        collection_name: str,
+        query_text: Optional[str] = None,
+        query_vector: Optional[List[float]] = None,
+        limit: int = None,
+        with_vector: bool = False
+    ):
+        if query_text is None and query_vector is None:
+            raise ValueError("One of query_text or query_vector must be provided!")
+    
        search_result = self.get_collection(collection_name).query.hybrid(
-            query = query_text,
+            query = None,
+            vector = query_vector if query_vector is not None else (await self.embed_data([query_text]))[0],
            limit = limit,
            include_vector = with_vector,
            return_metadata = wvc.query.MetadataQuery(score = True),
        )

-        # print(search_result.objects)
-
        return list(map(lambda result: ScoredResult(
            id = str(result.uuid),
            payload = result.properties,
@ -65,10 +83,10 @@ class WeaviateAdapter(VectorDBInterface):
        ), search_result.objects))

    async def batch_search(self, collection_name: str, query_texts: List[str], limit: int,  with_vectors: bool = False):
-        def query_search(query_text):
-            return self.search(collection_name, query_text, limit = limit, with_vector = with_vectors)
+        def query_search(query_vector):
+            return self.search(collection_name, query_vector = query_vector, limit = limit, with_vector = with_vectors)

-        return [await query_search(query_text) for query_text in query_texts]
+        return [await query_search(query_vector) for query_vector in await self.embed_data(query_texts)]

    async def prune(self):
        self.client.collections.delete_all()
--- a/cognee/infrastructure/files/utils/get_file_metadata.py
+++ b/cognee/infrastructure/files/utils/get_file_metadata.py
@ -1,10 +1,8 @@
 from typing import BinaryIO, TypedDict
 import filetype
-from unstructured.cleaners.core import clean
-from unstructured.partition.pdf import partition_pdf
+from pypdf import PdfReader
 from .extract_keywords import extract_keywords

-
 class FileTypeException(Exception):
    message: str

@ -27,10 +25,10 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
    keywords: list = []

    if file_type.extension == "pdf":
-        elements = partition_pdf(file = file, strategy = "fast")
-        keywords = extract_keywords(
-            "\n".join(map(lambda element: clean(element.text), elements))
-        )
+        reader = PdfReader(stream = file)
+        pages = list(reader.pages[:3])
+        text = "\n".join([page.extract_text().strip() for page in pages])
+        keywords = extract_keywords(text)

    file_path = file.name
    file_name = file_path.split("/")[-1].split(".")[0]
--- a/cognee/modules/cognify/llm/classify_content.py
+++ b/cognee/modules/cognify/llm/classify_content.py
@ -35,8 +35,9 @@ def extract_categories(llm_output) -> List[dict]:

 if __name__ == "__main__":
    import asyncio
-    asyncio.run(classify_into_categories("""Russia summons US ambassador in Moscow and says it will expel diplomats who meddle in its internal affairs
+    gg = asyncio.run(classify_into_categories("""Russia summons US ambassador in Moscow and says it will expel diplomats who meddle in its internal affairs
 The Russian foreign ministry said on Thursday it had summoned the US ambassador in Moscow and warned her against “attempts to interfere in the internal affairs of the Russian Federation”, reports Reuters.

 Ahead of a March presidential election, it said in a statement that such behaviour would be “firmly and resolutely suppressed, up to and including the expulsion as ‘persona non grata’ of US embassy staff involved in such actions”.""", "classify_content.txt", DefaultContentPrediction))
+    print(gg)

--- a/cognee/modules/cognify/llm/label_content.py
+++ b/cognee/modules/cognify/llm/label_content.py
@ -1,5 +1,5 @@
 """ This module contains the code to classify content into categories using the LLM API. """
-from typing import Type, List
+from typing import Type
 from pydantic import BaseModel
 from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
@ -11,4 +11,4 @@ async def label_content(text_input: str, system_prompt_file: str, response_model

    llm_output = await llm_client.acreate_structured_output(text_input, system_prompt, response_model)

-    return llm_output.dict()
+    return llm_output.model_dump()
--- a/cognee/modules/cognify/llm/summarize_content.py
+++ b/cognee/modules/cognify/llm/summarize_content.py
@ -1,5 +1,5 @@
 """ This module contains the code to classify content into categories using the LLM API. """
-from typing import Type, List
+from typing import Type
 from pydantic import BaseModel
 from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
@ -11,4 +11,4 @@ async def summarize_content(text_input: str, system_prompt_file: str, response_m

    llm_output = await llm_client.acreate_structured_output(text_input, system_prompt, response_model)

-    return llm_output.dict()
+    return llm_output.model_dump()
--- a/notebooks/Demo_graph.ipynb
+++ b/notebooks/Demo_graph.ipynb
@ -1496,16 +1496,7 @@
   "outputs": [],
   "source": [
    "\n",
-    "unique_layer_uuids = set(node['layer_decomposition_uuid'] for node in node_descriptions)\n",
-    "collection_config = CollectionConfig(\n",
-    "    vector_config={\n",
-    "        'content': models.VectorParams(\n",
-    "            distance=models.Distance.COSINE,\n",
-    "            size=3072\n",
-    "        )\n",
-    "    },\n",
-    "    # Set other configs as needed\n",
-    ")"
+    "unique_layer_uuids = set(node['layer_decomposition_uuid'] for node in node_descriptions)"
   ]
  },
  {
@ -1515,7 +1506,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# await db.create_collection(\"blabla\",collection_config)"
+    "# await db.create_collection(\"blabla\")"
   ]
  },
  {
@ -1526,7 +1517,7 @@
   "outputs": [],
   "source": [
    "for layer in unique_layer_uuids:\n",
-    "    await db.create_collection(layer,collection_config)"
+    "    await db.create_collection(layer)"
   ]
  },
  {
--- a/notebooks/full_run.ipynb
+++ b/notebooks/full_run.ipynb
@ -0,0 +1,47 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "38135bf7",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from cognee import add, cognify, search\n",
+        "from cognee.utils import render_graph\n",
+        "from os import listdir, path\n",
+        "\n",
+        "data_path = path.abspath(\"../.data\")\n",
+        "\n",
+        "print(data_path)\n",
+        "\n",
+        "await add(path.abspath(\"../.data\"), \"izmene\")\n",
+        "\n",
+        "graph = await cognify(\"izmene\")\n",
+        "\n",
+        "await render_graph(graph, graph_type=\"networkx\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.13"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
--- a/notebooks/vector_retrieval_demo.ipynb
+++ b/notebooks/vector_retrieval_demo.ipynb
@ -1,525 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "id": "50d5afda-418f-436b-b467-004863193d4a",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "articles = [\n",
-        "\"\"\"Edward VII (Albert Edward; 9 November 1841 – 6 May 1910) was King of the United Kingdom and the British Dominions, and Emperor of India, from 22 January 1901 until his death in 1910.\n",
-        "The second child and eldest son of Queen Victoria and Prince Albert of Saxe-Coburg and Gotha, Edward, nicknamed \"Bertie\", was related to royalty throughout Europe. He was Prince of Wales and heir apparent to the British throne for almost 60 years. During his mother's reign, he was largely excluded from political influence and came to personify the fashionable, leisured elite. He married Princess Alexandra of Denmark in 1863, and the couple had six children. As Prince of Wales, Edward travelled throughout Britain performing ceremonial public duties and represented Britain on visits abroad. His tours of North America in 1860 and of the Indian subcontinent in 1875 proved popular successes, but despite public approval, his reputation as a playboy prince soured his relationship with his mother.\n",
-        "Edward inherited the throne upon his mother's death in 1901. The King played a role in the modernisation of the British Home Fleet and the reorganisation of the British Army after the Second Boer War of 1899–1902. He re-instituted traditional ceremonies as public displays and broadened the range of people with whom royalty socialised. He fostered good relations between Britain and other European countries, especially France, for which he was popularly called \"Peacemaker\", but his relationship with his nephew, German Emperor Wilhelm II, was poor. The Edwardian era, which covered Edward's reign and was named after him, coincided with the start of a new century and heralded significant changes in technology and society, including steam turbine propulsion and the rise of socialism. He died in 1910 in the midst of a constitutional crisis that was resolved the following year by the Parliament Act 1911, which restricted the power of the unelected House of Lords. Edward was succeeded by his only surviving son, George V.\"\"\",\n",
-        "    \"\"\"George V (George Frederick Ernest Albert; 3 June 1865 – 20 January 1936) was King of the United Kingdom and the British Dominions, and Emperor of India, from 6 May 1910 until his death in 1936.\n",
-        "George was born during the reign of his paternal grandmother, Queen Victoria, as the second son of the Prince and Princess of Wales (later King Edward VII and Queen Alexandra). He was third in the line of succession to the British throne behind his father and his elder brother, Prince Albert Victor. From 1877 to 1892, George served in the Royal Navy, until his elder brother's unexpected death in January 1892 put him directly in line for the throne. The next year, George married his brother's fiancée, Princess Victoria Mary of Teck, and they had six children. When Queen Victoria died in 1901, George's father ascended the throne as Edward VII, and George was created Prince of Wales. He became king-emperor on his father's death in 1910.\n",
-        "George's reign saw the rise of socialism, communism, fascism, Irish republicanism, and the Indian independence movement, all of which radically changed the political landscape of the British Empire, which itself reached its territorial peak by the beginning of the 1920s. The Parliament Act 1911 established the supremacy of the elected British House of Commons over the unelected House of Lords. As a result of the First World War (1914–1918), the empires of his first cousins Nicholas II of Russia and Wilhelm II of Germany fell, while the British Empire expanded to its greatest effective extent. In 1917, George became the first monarch of the House of Windsor, which he renamed from the House of Saxe-Coburg and Gotha as a result of anti-German public sentiment. He appointed the first Labour ministry in 1924, and the 1931 Statute of Westminster recognised the Empire's Dominions as separate, independent states within the British Commonwealth of Nations.\n",
-        "George suffered from smoking-related health problems during his later reign. On his death in January 1936, he was succeeded by his eldest son, Edward VIII. Edward abdicated in December of that year and was succeeded by his younger brother Albert, who took the regnal name George VI.\"\"\",\n",
-        "\"\"\"Edward VIII (Edward Albert Christian George Andrew Patrick David; 23 June 1894 – 28 May 1972), later known as the Duke of Windsor, was King of the United Kingdom and the Dominions of the British Empire, and Emperor of India, from 20 January 1936 until his abdication in December of the same year.[a]\n",
-        "Edward was born during the reign of his great-grandmother Queen Victoria as the eldest child of the Duke and Duchess of York, later King George V and Queen Mary. He was created Prince of Wales on his 16th birthday, seven weeks after his father succeeded as king. As a young man, Edward served in the British Army during the First World War and undertook several overseas tours on behalf of his father. The Prince of Wales gained popularity due to his charm and charisma, and his fashion sense became a hallmark of the era. After the war, his conduct began to give cause for concern; he engaged in a series of sexual affairs that worried both his father and the British prime minister, Stanley Baldwin.\n",
-        "Upon his father's death in 1936, Edward became the second monarch of the House of Windsor. The new king showed impatience with court protocol, and caused consternation among politicians by his apparent disregard for established constitutional conventions. Only months into his reign, a constitutional crisis was caused by his proposal to marry Wallis Simpson, an American who had divorced her first husband and was seeking a divorce from her second. The prime ministers of the United Kingdom and the Dominions opposed the marriage, arguing a divorced woman with two living ex-husbands was politically and socially unacceptable as a prospective queen consort. Additionally, such a marriage would have conflicted with Edward's status as titular head of the Church of England, which, at the time, disapproved of remarriage after divorce if a former spouse was still alive. Edward knew the Baldwin government would resign if the marriage went ahead, which could have forced a general election and would have ruined his status as a politically neutral constitutional monarch. When it became apparent he could not marry Simpson and remain on the throne, he abdicated. He was succeeded by his younger brother, George VI. With a reign of 326 days, Edward was one of the shortest-reigning British monarchs to date.\n",
-        "After his abdication, Edward was created Duke of Windsor. He married Simpson in France on 3 June 1937, after her second divorce became final. Later that year, the couple toured Nazi Germany, which fed rumours that he was a Nazi sympathiser. During the Second World War, Edward was at first stationed with the British Military Mission to France but after the fall of France was appointed Governor of the Bahamas. After the war, Edward spent the rest of his life in France. He and Wallis remained married until his death in 1972; they had no children.\"\"\",\n",
-        "\"\"\"George VI (Albert Frederick Arthur George; 14 December 1895 – 6 February 1952) was King of the United Kingdom and the Dominions of the British Commonwealth from 11 December 1936 until his death on 6 February 1952. He was also the last Emperor of India from 1936 until the British Raj was dissolved in August 1947, and the first head of the Commonwealth following the London Declaration of 1949.\n",
-        "The future George VI was born during the reign of his great-grandmother Queen Victoria; he was named Albert at birth after his great-grandfather Prince Albert of Saxe-Coburg and Gotha and was known as \"Bertie\" to his family and close friends. His father ascended the throne as George V in 1910. As the second son of the king, Albert was not expected to inherit the throne. He spent his early life in the shadow of his elder brother, Edward, the heir apparent. Albert attended naval college as a teenager and served in the Royal Navy and Royal Air Force during the First World War. In 1920, he was made Duke of York. He married Lady Elizabeth Bowes-Lyon in 1923, and they had two daughters, Elizabeth and Margaret. In the mid-1920s, he engaged speech therapist Lionel Logue to treat his stutter, which he learned to manage to some degree. His elder brother ascended the throne as Edward VIII after their father died in 1936, but Edward abdicated later that year to marry the twice-divorced American socialite Wallis Simpson. As heir presumptive to Edward VIII, Albert became king, taking the regnal name George VI.\n",
-        "In September 1939, the British Empire and most Commonwealth countries—but not Ireland—declared war on Nazi Germany, following the invasion of Poland. War with the Kingdom of Italy and the Empire of Japan followed in 1940 and 1941, respectively. George VI was seen as sharing the hardships of the common people and his popularity soared. Buckingham Palace was bombed during the Blitz while the King and Queen were there, and his younger brother the Duke of Kent was killed on active service. George became known as a symbol of British determination to win the war. Britain and its allies were victorious in 1945, but the British Empire declined. Ireland had largely broken away, followed by the independence of India and Pakistan in 1947. George relinquished the title of Emperor of India in June 1948 and instead adopted the new title of Head of the Commonwealth. He was beset by smoking-related health problems in the later years of his reign and died at Sandringham House, aged 56, of a coronary thrombosis in 1952. He was succeeded by his elder daughter, Elizabeth II.\"\"\",\n",
-        "\"\"\"Elizabeth II (Elizabeth Alexandra Mary; 21 April 1926 – 8 September 2022) was Queen of the United Kingdom and other Commonwealth realms from 6 February 1952 until her death in 2022. She was queen regnant of 32 sovereign states over the course of her lifetime and remained the monarch of 15 realms by the time of her death. Her reign of over 70 years is the longest of any British monarch, the longest of any female monarch, and the second longest verified reign of any monarch of a sovereign state in history.\n",
-        "Elizabeth was born in Mayfair, London, during the reign of her paternal grandfather, King George V. She was the first child of the Duke and Duchess of York (later King George VI and Queen Elizabeth The Queen Mother). Her father acceded to the throne in 1936 upon the abdication of his brother Edward VIII, making the ten-year-old Princess Elizabeth the heir presumptive. She was educated privately at home and began to undertake public duties during the Second World War, serving in the Auxiliary Territorial Service. In November 1947, she married Philip Mountbatten, a former prince of Greece and Denmark, and their marriage lasted 73 years until his death in 2021. They had four children: Charles, Anne, Andrew, and Edward.\n",
-        "When her father died in February 1952, Elizabeth—then 25 years old—became queen of seven independent Commonwealth countries: the United Kingdom, Canada, Australia, New Zealand, South Africa, Pakistan, and Ceylon (known today as Sri Lanka), as well as head of the Commonwealth. Elizabeth reigned as a constitutional monarch through major political changes such as the Troubles in Northern Ireland, devolution in the United Kingdom, the decolonisation of Africa, and the United Kingdom's accession to the European Communities, as well as its subsequent withdrawal. The number of her realms varied over time as territories gained independence and some realms became republics. As queen, Elizabeth was served by more than 170 prime ministers across her realms. Her many historic visits and meetings included state visits to China in 1986, to Russia in 1994, and to the Republic of Ireland in 2011, and meetings with five popes and fourteen US presidents.\n",
-        "Significant events included Elizabeth's coronation in 1953 and the celebrations of her Silver, Golden, Diamond, and Platinum jubilees in 1977, 2002, 2012, and 2022, respectively. Although she faced occasional republican sentiment and media criticism of her family—particularly after the breakdowns of her children's marriages, her annus horribilis in 1992, and the death in 1997 of her former daughter-in-law Diana—support for the monarchy in the United Kingdom remained consistently high throughout her lifetime, as did her personal popularity. Elizabeth died aged 96 at Balmoral Castle in September 2022, and was succeeded by her eldest son, Charles III.\"\"\"\n",
-        "]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "id": "e328a903-d084-4d07-9b95-0a9196d7f719",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "categories = {\n",
-        "    \"Natural Language Text\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Articles, essays, and reports\",\n",
-        "            \"Books and manuscripts\",\n",
-        "            \"News stories and blog posts\",\n",
-        "            \"Research papers and academic publications\",\n",
-        "            \"Social media posts and comments\",\n",
-        "            \"Website content and product descriptions\",\n",
-        "            \"Personal narratives and stories\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Structured Documents\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Spreadsheets and tables\",\n",
-        "            \"Forms and surveys\",\n",
-        "            \"Databases and CSV files\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Code and Scripts\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Source code in various programming languages\",\n",
-        "            \"Shell commands and scripts\",\n",
-        "            \"Markup languages (HTML, XML)\",\n",
-        "            \"Stylesheets (CSS) and configuration files (YAML, JSON, INI)\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Conversational Data\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Chat transcripts and messaging history\",\n",
-        "            \"Customer service logs and interactions\",\n",
-        "            \"Conversational AI training data\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Educational Content\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Textbook content and lecture notes\",\n",
-        "            \"Exam questions and academic exercises\",\n",
-        "            \"E-learning course materials\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Creative Writing\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Poetry and prose\",\n",
-        "            \"Scripts for plays, movies, and television\",\n",
-        "            \"Song lyrics\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Technical Documentation\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Manuals and user guides\",\n",
-        "            \"Technical specifications and API documentation\",\n",
-        "            \"Helpdesk articles and FAQs\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Legal and Regulatory Documents\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Contracts and agreements\",\n",
-        "            \"Laws, regulations, and legal case documents\",\n",
-        "            \"Policy documents and compliance materials\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Medical and Scientific Texts\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Clinical trial reports\",\n",
-        "            \"Patient records and case notes\",\n",
-        "            \"Scientific journal articles\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Financial and Business Documents\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Financial reports and statements\",\n",
-        "            \"Business plans and proposals\",\n",
-        "            \"Market research and analysis reports\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Advertising and Marketing Materials\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Ad copies and marketing slogans\",\n",
-        "            \"Product catalogs and brochures\",\n",
-        "            \"Press releases and promotional content\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Emails and Correspondence\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Professional and formal correspondence\",\n",
-        "            \"Personal emails and letters\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Metadata and Annotations\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Image and video captions\",\n",
-        "            \"Annotations and metadata for various media\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Language Learning Materials\": {\n",
-        "        \"type\": \"TEXT\",\n",
-        "        \"subclass\": [\n",
-        "            \"Vocabulary lists and grammar rules\",\n",
-        "            \"Language exercises and quizzes\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Audio Content\": {\n",
-        "    \"type\": \"AUDIO\",\n",
-        "    \"subclass\": [\n",
-        "        \"Music tracks and albums\",\n",
-        "        \"Podcasts and radio broadcasts\",\n",
-        "        \"Audiobooks and audio guides\",\n",
-        "        \"Recorded interviews and speeches\",\n",
-        "        \"Sound effects and ambient sounds\"\n",
-        "    ]\n",
-        "    },\n",
-        "    \"Image Content\": {\n",
-        "        \"type\": \"IMAGE\",\n",
-        "        \"subclass\": [\n",
-        "            \"Photographs and digital images\",\n",
-        "            \"Illustrations, diagrams, and charts\",\n",
-        "            \"Infographics and visual data representations\",\n",
-        "            \"Artwork and paintings\",\n",
-        "            \"Screenshots and graphical user interfaces\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Video Content\": {\n",
-        "        \"type\": \"VIDEO\",\n",
-        "        \"subclass\": [\n",
-        "            \"Movies and short films\",\n",
-        "            \"Documentaries and educational videos\",\n",
-        "            \"Video tutorials and how-to guides\",\n",
-        "            \"Animated features and cartoons\",\n",
-        "            \"Live event recordings and sports broadcasts\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Multimedia Content\": {\n",
-        "        \"type\": \"MULTIMEDIA\",\n",
-        "        \"subclass\": [\n",
-        "            \"Interactive web content and games\",\n",
-        "            \"Virtual reality (VR) and augmented reality (AR) experiences\",\n",
-        "            \"Mixed media presentations and slide decks\",\n",
-        "            \"E-learning modules with integrated multimedia\",\n",
-        "            \"Digital exhibitions and virtual tours\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"3D Models and CAD Content\": {\n",
-        "        \"type\": \"3D_MODEL\",\n",
-        "        \"subclass\": [\n",
-        "            \"Architectural renderings and building plans\",\n",
-        "            \"Product design models and prototypes\",\n",
-        "            \"3D animations and character models\",\n",
-        "            \"Scientific simulations and visualizations\",\n",
-        "            \"Virtual objects for AR/VR environments\"\n",
-        "        ]\n",
-        "    },\n",
-        "    \"Procedural Content\": {\n",
-        "        \"type\": \"PROCEDURAL\",\n",
-        "        \"subclass\": [\n",
-        "            \"Tutorials and step-by-step guides\",\n",
-        "            \"Workflow and process descriptions\",\n",
-        "            \"Simulation and training exercises\",\n",
-        "            \"Recipes and crafting instructions\"\n",
-        "        ]\n",
-        "    }\n",
-        "}"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 3,
-      "id": "89a3f3a0",
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/Users/borisarzentar/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-            "  from .autonotebook import tqdm as notebook_tqdm\n",
-            "\u001b[32m2024-03-02 11:50:12.400\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mfastembed.embedding\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[33m\u001b[1mDefaultEmbedding, FlagEmbedding, JinaEmbedding are deprecated.Use from fastembed import TextEmbedding instead.\u001b[0m\n"
-          ]
-        }
-      ],
-      "source": [
-        "import uuid\n",
-        "from os import path\n",
-        "from qdrant_client.models import PointStruct, Distance\n",
-        "from cognee.root_dir import ROOT_DIR\n",
-        "from fastembed import TextEmbedding\n",
-        "# from cognee.openai_tools import get_embedding_with_backoff\n",
-        "from cognee.infrastructure.databases.vector import QDrantAdapter, CollectionConfig"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 5,
-      "id": "659e327e",
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "True\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "model_quantized.onnx: 100%|██████████| 279M/279M [08:28<00:00, 548kB/s]\n",
-            "\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "\u001b[A\n",
-            "model.onnx: 100%|██████████| 1.11G/1.11G [24:44<00:00, 748kB/s]\n",
-            "Fetching 9 files: 100%|██████████| 9/9 [24:46<00:00, 165.16s/it]\n"
-          ]
-        },
-        {
-          "ename": "ValidationError",
-          "evalue": "2 validation errors for PointStruct\nvector.list[float]\n  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]\n    For further information visit https://errors.pydantic.dev/2.6/v/list_type\nvector.dict[str,union[SparseVector,list[float]]]\n  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]\n    For further information visit https://errors.pydantic.dev/2.6/v/dict_type",
-          "output_type": "error",
-          "traceback": [
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-            "\u001b[0;31mValidationError\u001b[0m                           Traceback (most recent call last)",
-            "Cell \u001b[0;32mIn[5], line 39\u001b[0m\n\u001b[1;32m     36\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_data_points(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, data_points)\n\u001b[1;32m     37\u001b[0m     \u001b[38;5;28mprint\u001b[39m(result)\n\u001b[0;32m---> 39\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m run()\n",
-            "Cell \u001b[0;32mIn[5], line 36\u001b[0m, in \u001b[0;36mrun\u001b[0;34m()\u001b[0m\n\u001b[1;32m     28\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_collection(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, CollectionConfig(\n\u001b[1;32m     29\u001b[0m     vector_config \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m     30\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msize\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m1536\u001b[39m,\n\u001b[1;32m     31\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdistance\u001b[39m\u001b[38;5;124m\"\u001b[39m: Distance\u001b[38;5;241m.\u001b[39mDOT\n\u001b[1;32m     32\u001b[0m     }\n\u001b[1;32m     33\u001b[0m ))\n\u001b[1;32m     34\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n\u001b[0;32m---> 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_data_points(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_collection\u001b[39m\u001b[38;5;124m\"\u001b[39m, data_points)\n\u001b[1;32m     37\u001b[0m \u001b[38;5;28mprint\u001b[39m(result)\n",
-            "File \u001b[0;32m~/Projects/Topoteretes/cognee/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py:58\u001b[0m, in \u001b[0;36mQDrantAdapter.create_data_points\u001b[0;34m(self, collection_name, data_points)\u001b[0m\n\u001b[1;32m     55\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_data_points\u001b[39m(\u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, data_points: List[\u001b[38;5;28many\u001b[39m]):\n\u001b[1;32m     56\u001b[0m     client \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_qdrant_client()\n\u001b[0;32m---> 58\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m client\u001b[38;5;241m.\u001b[39mupload_points(\n\u001b[1;32m     59\u001b[0m         collection_name \u001b[38;5;241m=\u001b[39m collection_name,\n\u001b[1;32m     60\u001b[0m         points \u001b[38;5;241m=\u001b[39m data_points\n\u001b[1;32m     61\u001b[0m     )\n",
-            "File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/async_qdrant_client.py:1800\u001b[0m, in \u001b[0;36mAsyncQdrantClient.upload_points\u001b[0;34m(self, collection_name, points, batch_size, parallel, method, max_retries, wait, shard_key_selector, **kwargs)\u001b[0m\n\u001b[1;32m   1776\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Upload points to the collection\u001b[39;00m\n\u001b[1;32m   1777\u001b[0m \n\u001b[1;32m   1778\u001b[0m \u001b[38;5;124;03mSimilar to `upload_collection` method, but operates with points, rather than vector and payload individually.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1797\u001b[0m \n\u001b[1;32m   1798\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1799\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(kwargs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnknown arguments: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1800\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39mupload_points(\n\u001b[1;32m   1801\u001b[0m     collection_name\u001b[38;5;241m=\u001b[39mcollection_name,\n\u001b[1;32m   1802\u001b[0m     points\u001b[38;5;241m=\u001b[39mpoints,\n\u001b[1;32m   1803\u001b[0m     batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m   1804\u001b[0m     parallel\u001b[38;5;241m=\u001b[39mparallel,\n\u001b[1;32m   1805\u001b[0m     method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m   1806\u001b[0m     max_retries\u001b[38;5;241m=\u001b[39mmax_retries,\n\u001b[1;32m   1807\u001b[0m     wait\u001b[38;5;241m=\u001b[39mwait,\n\u001b[1;32m   1808\u001b[0m     shard_key_selector\u001b[38;5;241m=\u001b[39mshard_key_selector,\n\u001b[1;32m   1809\u001b[0m )\n",
-            "File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/local/async_qdrant_local.py:661\u001b[0m, in \u001b[0;36mAsyncQdrantLocal.upload_points\u001b[0;34m(self, collection_name, points, **kwargs)\u001b[0m\n\u001b[1;32m    658\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mupload_points\u001b[39m(\n\u001b[1;32m    659\u001b[0m     \u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, points: Iterable[types\u001b[38;5;241m.\u001b[39mPointStruct], \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any\n\u001b[1;32m    660\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 661\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_upload_points\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\u001b[43m)\u001b[49m\n",
-            "File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/qdrant_client/local/async_qdrant_local.py:673\u001b[0m, in \u001b[0;36mAsyncQdrantLocal._upload_points\u001b[0;34m(self, collection_name, points)\u001b[0m\n\u001b[1;32m    668\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_upload_points\u001b[39m(\n\u001b[1;32m    669\u001b[0m     \u001b[38;5;28mself\u001b[39m, collection_name: \u001b[38;5;28mstr\u001b[39m, points: Iterable[Union[types\u001b[38;5;241m.\u001b[39mPointStruct, types\u001b[38;5;241m.\u001b[39mRecord]]\n\u001b[1;32m    670\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    671\u001b[0m     collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_collection(collection_name)\n\u001b[1;32m    672\u001b[0m     collection\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[0;32m--> 673\u001b[0m         \u001b[43m[\u001b[49m\n\u001b[1;32m    674\u001b[0m \u001b[43m            \u001b[49m\u001b[43mrest_models\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPointStruct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    675\u001b[0m \u001b[43m                \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvector\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpayload\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpayload\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m    676\u001b[0m \u001b[43m            \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    677\u001b[0m \u001b[43m            \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpoint\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\n\u001b[1;32m    678\u001b[0m \u001b[43m        \u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m    679\u001b[0m     )\n",
-            "Cell \u001b[0;32mIn[5], line 17\u001b[0m, in \u001b[0;36mcreate_data_point\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_data_point\u001b[39m(data: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m PointStruct:\n\u001b[0;32m---> 17\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPointStruct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     18\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43muuid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muuid4\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     19\u001b[0m \u001b[43m        \u001b[49m\u001b[43mvector\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43membed_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     20\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpayload\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m     21\u001b[0m \u001b[43m            \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mraw\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     22\u001b[0m \u001b[43m        \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m     23\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-            "File \u001b[0;32m~/Projects/Topoteretes/cognee/.venv/lib/python3.12/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m    169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m    170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n",
-            "\u001b[0;31mValidationError\u001b[0m: 2 validation errors for PointStruct\nvector.list[float]\n  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]\n    For further information visit https://errors.pydantic.dev/2.6/v/list_type\nvector.dict[str,union[SparseVector,list[float]]]\n  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]\n    For further information visit https://errors.pydantic.dev/2.6/v/dict_type"
-          ]
-        }
-      ],
-      "source": [
-        "database_path = path.join(path.abspath(ROOT_DIR), \"database/data\", \"vector_retrieval_demo.db\")\n",
-        "\n",
-        "try:\n",
-        "    import shutil\n",
-        "    shutil.rmtree(database_path)\n",
-        "except Exception as exception:\n",
-        "    print(exception)\n",
-        "    pass\n",
-        "\n",
-        "def embed_data(data: str):\n",
-        "    embedding_engine = TextEmbedding(model_name = \"sentence-transformers/paraphrase-multilingual-mpnet-base-v2\")\n",
-        "    embedding_engine.embed(documents = [data])\n",
-        "\n",
-        "qdrant_client = QDrantAdapter(qdrant_path = database_path, qdrant_url = None, qdrant_api_key = None)\n",
-        "\n",
-        "def create_data_point(data: str) -> PointStruct:\n",
-        "    return PointStruct(\n",
-        "        id = str(uuid.uuid4()),\n",
-        "        vector = embed_data(data),\n",
-        "        payload = {\n",
-        "            \"raw\": data,\n",
-        "        }\n",
-        "    )\n",
-        "\n",
-        "data_points = map(create_data_point, articles)\n",
-        "\n",
-        "async def run():\n",
-        "    result = await qdrant_client.create_collection(\"test_collection\", CollectionConfig(\n",
-        "        vector_config = {\n",
-        "            \"size\": 1536,\n",
-        "            \"distance\": Distance.DOT\n",
-        "        }\n",
-        "    ))\n",
-        "    print(result)\n",
-        "\n",
-        "    result = await qdrant_client.create_data_points(\"test_collection\", data_points)\n",
-        "    print(result)\n",
-        "\n",
-        "await run()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "4b6163a2",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "qdrant_client = QDrantAdapter(qdrant_path = database_path, qdrant_url = None, qdrant_api_key = None)\n",
-        "\n",
-        "query_vector = embed_data(\"Last emperor of India\")\n",
-        "\n",
-        "results = await qdrant_client.find_related_data_points(collection_name = \"test_collection\", query_vector = query_vector)\n",
-        "\n",
-        "import json\n",
-        "\n",
-        "for result in results:\n",
-        "    print(result.score)\n",
-        "\n",
-        "results[0].payload[\"raw\"]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "3d18b00f",
-      "metadata": {},
-      "outputs": [],
-      "source": []
-    }
-  ],
-  "metadata": {
-    "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.10.13"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
-}
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -18,7 +18,7 @@ classifiers = [

 [tool.poetry.dependencies]
 python = "~3.10"
-openai = "1.14.3"
+openai = "1.12.0"
 python-dotenv = "1.0.1"
 fastapi = "^0.109.2"
 uvicorn = "0.22.0"
@ -26,7 +26,7 @@ boto3 = "^1.26.125"
 gunicorn = "^20.1.0"
 sqlalchemy = "^2.0.21"
 asyncpg = "^0.28.0"
-instructor = "^0.6.6"
+instructor = "^0.3.4"
 networkx = "^3.2.1"
 graphviz = "^0.20.1"
 langdetect = "^1.0.9"
@ -34,23 +34,24 @@ debugpy = "^1.8.0"
 pyarrow = "^15.0.0"
 pylint = "^3.0.3"
 aiosqlite = "^0.20.0"
-unstructured = {extras = ["all-docs"], version = "^0.12.5"}
 pymupdf = "^1.23.25"
-pandas = "^2.2.1"
+pandas = "^2.2.0"
 greenlet = "^3.0.3"
 ruff = "^0.2.2"
 filetype = "^1.2.0"
 nltk = "^3.8.1"
-scikit-learn = "^1.4.1.post1"
 dlt = "^0.4.6"
 duckdb = {version = "^0.10.0", extras = ["dlt"]}
 overrides = "^7.7.0"
 aiofiles = "^23.2.1"
+qdrant-client = "^1.8.0"
 duckdb-engine = "^0.11.2"
 graphistry = "^0.33.5"
 tenacity = "^8.2.3"
 weaviate-client = "^4.5.4"
-qdrant-client = "^1.8.0"
+scikit-learn = "^1.4.1.post1"
+fastembed = "^0.2.5"
+pypdf = "^4.1.0"

 [tool.poetry.extras]
 dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
@ -61,7 +62,6 @@ postgres = ["psycopg2-binary", "psycopg2cffi"]
 redshift = ["psycopg2-binary", "psycopg2cffi"]
 parquet = ["pyarrow"]
 duckdb = ["duckdb"]
-qdrant = ["qdrant-client"]
 filesystem = ["s3fs", "botocore"]
 s3 = ["s3fs", "botocore"]
 gs = ["gcsfs"]
@ -93,7 +93,6 @@ mkdocstrings = "^0.22.0"
 mkdocstrings-python = "^1.1.2"
 pytest-examples = "^0.0.10"
 mkdocs-jupyter = "^0.24.6"
-mkdocs-rss-plugin = "^1.12.0"
 mkdocs-minify-plugin = "^0.8.0"
 mkdocs-redirects = "^1.2.1"