cognee/cognitive_architecture/database/vectordb/vectordb.py
2023-12-16 15:25:32 +01:00

370 lines
14 KiB
Python

# Make sure to install the following packages: dlt, langchain, duckdb, python-dotenv, openai, weaviate-client
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
from marshmallow import Schema, fields
from cognitive_architecture.database.vectordb.loaders.loaders import _document_loader
# Add the parent directory to sys.path
logging.basicConfig(level=logging.INFO)
from langchain.retrievers import WeaviateHybridSearchRetriever, ParentDocumentRetriever
from weaviate.gql.get import HybridFusion
import tracemalloc
tracemalloc.start()
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain.schema import Document
import weaviate
load_dotenv()
from ...config import Config
config = Config()
config.load()
LTM_MEMORY_ID_DEFAULT = "00000"
ST_MEMORY_ID_DEFAULT = "0000"
BUFFER_ID_DEFAULT = "0000"
class VectorDB:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
def __init__(
self,
user_id: str,
index_name: str,
memory_id: str,
namespace: str = None,
embeddings = None,
):
self.user_id = user_id
self.index_name = index_name
self.namespace = namespace
self.memory_id = memory_id
self.embeddings = embeddings
class PineconeVectorDB(VectorDB):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.init_pinecone(self.index_name)
def init_pinecone(self, index_name):
# Pinecone initialization logic
pass
import langchain.embeddings
class WeaviateVectorDB(VectorDB):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.init_weaviate(embeddings= self.embeddings, namespace = self.namespace)
def init_weaviate(self, embeddings=OpenAIEmbeddings(), namespace=None,retriever_type="",):
# Weaviate initialization logic
auth_config = weaviate.auth.AuthApiKey(
api_key=os.environ.get("WEAVIATE_API_KEY")
)
client = weaviate.Client(
url=os.environ.get("WEAVIATE_URL"),
auth_client_secret=auth_config,
additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")},
)
if retriever_type == "single_document_context":
retriever = WeaviateHybridSearchRetriever(
client=client,
index_name=namespace,
text_key="text",
attributes=[],
embedding=embeddings,
create_schema_if_missing=True,
)
return retriever
elif retriever_type == "multi_document_context":
retriever = WeaviateHybridSearchRetriever(
client=client,
index_name=namespace,
text_key="text",
attributes=[],
embedding=embeddings,
create_schema_if_missing=True,
)
return retriever
else :
return client
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# store = InMemoryStore()
# retriever = ParentDocumentRetriever(
# vectorstore=vectorstore,
# docstore=store,
# child_splitter=child_splitter,
# )
from marshmallow import Schema, fields
def create_document_structure(observation, params, metadata_schema_class=None):
"""
Create and validate a document structure with optional custom fields.
:param observation: Content of the document.
:param params: Metadata information.
:param metadata_schema_class: Custom metadata schema class (optional).
:return: A list containing the validated document data.
"""
document_data = {
"metadata": params,
"page_content": observation
}
def get_document_schema():
class DynamicDocumentSchema(Schema):
metadata = fields.Nested(metadata_schema_class, required=True)
page_content = fields.Str(required=True)
return DynamicDocumentSchema
# Validate and deserialize, defaulting to "1.0" if not provided
CurrentDocumentSchema = get_document_schema()
loaded_document = CurrentDocumentSchema().load(document_data)
return [loaded_document]
def _stuct(self, observation, params, metadata_schema_class =None):
"""Utility function to create the document structure with optional custom fields."""
# Construct document data
document_data = {
"metadata": params,
"page_content": observation
}
def get_document_schema():
class DynamicDocumentSchema(Schema):
metadata = fields.Nested(metadata_schema_class, required=True)
page_content = fields.Str(required=True)
return DynamicDocumentSchema
# Validate and deserialize # Default to "1.0" if not provided
CurrentDocumentSchema = get_document_schema()
loaded_document = CurrentDocumentSchema().load(document_data)
return [loaded_document]
async def add_memories(self, observation, loader_settings=None, params=None, namespace=None, metadata_schema_class=None, embeddings = 'hybrid'):
# Update Weaviate memories here
if namespace is None:
namespace = self.namespace
retriever = self.init_weaviate(embeddings=OpenAIEmbeddings(),namespace = namespace, retriever_type="single_document_context")
if loader_settings:
# Assuming _document_loader returns a list of documents
documents = await _document_loader(observation, loader_settings)
logging.info("here are the docs %s", str(documents))
chunk_count = 0
for doc in documents[0]:
chunk_count += 1
params['chunk_order'] = chunk_count
# document_to_load = self._stuct(doc.page_content, params, metadata_schema_class)
# logging.info("Loading document with provided loader settings %s", str(document_to_load))
retriever.add_documents([
Document(metadata=params, page_content=doc.page_content)])
else:
chunk_count = 0
from cognitive_architecture.database.vectordb.chunkers.chunkers import chunk_data
documents = [chunk_data(chunk_strategy="VANILLA", source_data=observation, chunk_size=50,
chunk_overlap=20)]
for doc in documents[0]:
chunk_count += 1
params['chunk_order'] = chunk_count
# document_to_load = self._stuct(observation, params, metadata_schema_class)
logging.info("Loading document with defautl loader settings %s", str(doc))
# logging.info("Loading document with defautl loader settings %s", str(document_to_load))
retriever.add_documents([
Document(metadata=params, page_content=doc.page_content)])
async def fetch_memories(self, observation: str, namespace: str = None, search_type: str = 'hybrid', **kwargs):
"""
Fetch documents from weaviate.
Parameters:
- observation (str): User query.
- namespace (str, optional): Type of memory accessed.
- search_type (str, optional): Type of search ('text', 'hybrid', 'bm25', 'generate', 'generate_grouped'). Defaults to 'hybrid'.
- **kwargs: Additional parameters for flexibility.
Returns:
List of documents matching the query or an empty list in case of error.
Example:
fetch_memories(query="some query", search_type='text', additional_param='value')
"""
client = self.init_weaviate(namespace =self.namespace)
if search_type is None:
search_type = 'hybrid'
logging.info("The search type is s%", search_type)
if search_type == 'summary':
from weaviate.classes import Filter
client = weaviate.connect_to_wcs(
cluster_url=config.weaviate_url,
auth_credentials=weaviate.AuthApiKey(config.weaviate_api_key)
)
summary_collection = client.collections.get(self.namespace)
response = summary_collection.query.fetch_objects(
filters=Filter("user_id").equal(self.user_id) &
Filter("chunk_order").less_than(25),
limit=15
)
return response
if not namespace:
namespace = self.namespace
logging.info("Query on namespace %s", namespace)
params_user_id = {
"path": ["user_id"],
"operator": "Like",
"valueText": self.user_id,
}
def list_objects_of_class(class_name, schema):
return [
prop["name"]
for class_obj in schema["classes"]
if class_obj["class"] == class_name
for prop in class_obj["properties"]
]
base_query = client.query.get(
namespace, list(list_objects_of_class(namespace, client.schema.get()))
).with_additional(
["id", "creationTimeUnix", "lastUpdateTimeUnix", "score", 'distance']
).with_where(params_user_id).with_limit(10)
n_of_observations = kwargs.get('n_of_observations', 2)
# try:
if search_type == 'text':
query_output = (
base_query
.with_near_text({"concepts": [observation]})
.with_autocut(n_of_observations)
.do()
)
elif search_type == 'hybrid':
query_output = (
base_query
.with_hybrid(query=observation, fusion_type=HybridFusion.RELATIVE_SCORE)
.with_autocut(n_of_observations)
.do()
)
elif search_type == 'bm25':
query_output = (
base_query
.with_bm25(query=observation)
.with_autocut(n_of_observations)
.do()
)
elif search_type == 'generate':
generate_prompt = kwargs.get('generate_prompt', "")
query_output = (
base_query
.with_generate(single_prompt=observation)
.with_near_text({"concepts": [observation]})
.with_autocut(n_of_observations)
.do()
)
elif search_type == 'generate_grouped':
generate_prompt = kwargs.get('generate_prompt', "")
query_output = (
base_query
.with_generate(grouped_task=observation)
.with_near_text({"concepts": [observation]})
.with_autocut(n_of_observations)
.do()
)
else:
logging.error(f"Invalid search_type: {search_type}")
return []
# except Exception as e:
# logging.error(f"Error executing query: {str(e)}")
# return []
return query_output
async def delete_memories(self, namespace:str, params: dict = None):
if namespace is None:
namespace = self.namespace
client = self.init_weaviate(namespace = self.namespace)
if params:
where_filter = {
"path": ["id"],
"operator": "Equal",
"valueText": params.get("id", None),
}
return client.batch.delete_objects(
class_name=self.namespace,
# Same `where` filter as in the GraphQL API
where=where_filter,
)
else:
# Delete all objects
return client.batch.delete_objects(
class_name=namespace,
where={
"path": ["version"],
"operator": "Equal",
"valueText": "1.0",
},
)
async def count_memories(self, namespace: str = None, params: dict = None) -> int:
"""
Count memories in a Weaviate database.
Args:
namespace (str, optional): The Weaviate namespace to count memories in. If not provided, uses the default namespace.
Returns:
int: The number of memories in the specified namespace.
"""
if namespace is None:
namespace = self.namespace
client = self.init_weaviate(namespace =namespace)
try:
object_count = client.query.aggregate(namespace).with_meta_count().do()
return object_count
except Exception as e:
logging.info(f"Error counting memories: {str(e)}")
# Handle the error or log it
return 0
def update_memories(self, observation, namespace: str, params: dict = None):
client = self.init_weaviate(namespace = self.namespace)
client.data_object.update(
data_object={
# "text": observation,
"user_id": str(self.user_id),
"version": params.get("version", None) or "",
"agreement_id": params.get("agreement_id", None) or "",
"privacy_policy": params.get("privacy_policy", None) or "",
"terms_of_service": params.get("terms_of_service", None) or "",
"format": params.get("format", None) or "",
"schema_version": params.get("schema_version", None) or "",
"checksum": params.get("checksum", None) or "",
"owner": params.get("owner", None) or "",
"license": params.get("license", None) or "",
"validity_start": params.get("validity_start", None) or "",
"validity_end": params.get("validity_end", None) or ""
# **source_metadata,
},
class_name="Test",
uuid=params.get("id", None),
consistency_level=weaviate.data.replication.ConsistencyLevel.ALL, # default QUORUM
)
return