From 8113787cce2e9170916108d0025dfdbf19cbe4ce Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Thu, 7 Sep 2023 14:42:31 +0200
Subject: [PATCH 1/5] added initial version of the code for the modulator auto eval logic

---
 level_2/api.py                                |  29 +
 level_2/auth/auth.py                          |  35 +
 level_2/auth/auth_utils.py                    |  62 ++
 level_2/auth/cognito/JWTBearer.py             |  72 ++
 .../level_2_pdf_vectorstore__dlt_contracts.py | 279 +++++--
 level_2/poetry.lock                           | 734 +++++++++++++++++-
 level_2/pyproject.toml                        |   1 +
 .../crud_test.cpython-311-pytest-7.4.0.pyc    | Bin 0 -> 4301 bytes
 ...emantic_tests.cpython-311-pytest-7.4.0.pyc | Bin 0 -> 2291 bytes
 level_2/tests/base_test_set.json              |  38 +
 level_2/tests/crud_test.py                    |   2 +
 level_2/tests/semantic_tests.py               | 119 +++
 12 files changed, 1286 insertions(+), 85 deletions(-)
 create mode 100644 level_2/auth/auth.py
 create mode 100644 level_2/auth/auth_utils.py
 create mode 100644 level_2/auth/cognito/JWTBearer.py
 create mode 100644 level_2/tests/__pycache__/crud_test.cpython-311-pytest-7.4.0.pyc
 create mode 100644 level_2/tests/__pycache__/semantic_tests.cpython-311-pytest-7.4.0.pyc
 create mode 100644 level_2/tests/base_test_set.json
 create mode 100644 level_2/tests/semantic_tests.py

diff --git a/level_2/api.py b/level_2/api.py
index 8ddc33686..ab89ac217 100644
--- a/level_2/api.py
+++ b/level_2/api.py
@@ -216,6 +216,35 @@ async def create_context(
     except Exception as e:
         return JSONResponse(content={"response": {"error": str(e)}}, status_code=503)

+
+@app.post("/buffer/provide-feedback", response_model=dict)
+async def provide_feedback(
+    payload: Payload,
+    # files: List[UploadFile] = File(...),
+):
+    try:
+        decoded_payload = payload.payload
+
+        memory = Memory(user_id=decoded_payload["user_id"])
+
+        await memory.async_init()
+
+        # Feedback arrives either as a set of attention modulators or as an
+        # explicit total score; forward whichever one the payload provides.
+        if decoded_payload["total_score"] is None:
+            output = await memory._provide_feedback(
+                user_input=decoded_payload["prompt"], params=decoded_payload["params"],
+                attention_modulators=decoded_payload["attention_modulators"], total_score=None
+            )
+            return JSONResponse(content={"response": output}, status_code=200)
+        else:
+            output = await memory._provide_feedback(
+                user_input=decoded_payload["prompt"], params=decoded_payload["params"],
+                attention_modulators=None, total_score=decoded_payload["total_score"]
+            )
+            return JSONResponse(content={"response": output}, status_code=200)
+
+    except Exception as e:
+        return JSONResponse(content={"response": {"error": str(e)}}, status_code=503)
 def start_api_server(host: str = "0.0.0.0", port: int = 8000):
     """
     Start the API server using uvicorn.
diff --git a/level_2/auth/auth.py b/level_2/auth/auth.py
new file mode 100644
index 000000000..7c8a19005
--- /dev/null
+++ b/level_2/auth/auth.py
@@ -0,0 +1,35 @@
+import os
+
+import requests
+from dotenv import load_dotenv
+from fastapi import Depends, HTTPException
+from starlette.status import HTTP_403_FORBIDDEN
+
+from auth.cognito.JWTBearer import JWKS, JWTBearer, JWTAuthorizationCredentials
+
+load_dotenv()  # Automatically load environment variables from a '.env' file.
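+# A minimal sketch of the kind of '.env' this module assumes for local runs.
+# The variable names and values below are illustrative assumptions only --
+# nothing here is read yet, since the region and user pool id are currently
+# hardcoded further down:
+#
+#   AWS_REGION=eu-west-1
+#   COGNITO_USER_POOL_ID=eu-west-1_XXXXXXXXX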
+
+# jwks = JWKS.parse_obj(
+#     requests.get(
+#         f"https://cognito-idp.{os.environ.get('eu-west-1:46372257029')}.amazonaws.com/"
+#         f"{os.environ.get('eu-west-1_3VUqKzMgj')}/.well-known/jwks.json"
+#     ).json()
+# )
+# Construct the Cognito User Pool URL using the correct syntax
+region = "eu-west-1"
+user_pool_id = "eu-west-1_viUyNCqKp"
+cognito_url = f"https://cognito-idp.{region}.amazonaws.com/{user_pool_id}/.well-known/jwks.json"
+
+# Fetch the JWKS using the updated URL
+jwks = JWKS.parse_obj(requests.get(cognito_url).json())
+
+auth = JWTBearer(jwks)
+
+
+async def get_current_user(
+    credentials: JWTAuthorizationCredentials = Depends(auth)
+) -> str:
+    try:
+        return credentials.claims["username"]
+    except KeyError:
+        raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="Username missing")
diff --git a/level_2/auth/auth_utils.py b/level_2/auth/auth_utils.py
new file mode 100644
index 000000000..41a1b09c6
--- /dev/null
+++ b/level_2/auth/auth_utils.py
@@ -0,0 +1,53 @@
+import boto3
+import requests
+
+from cognito.JWTBearer import JWKS, JWTBearer, JWTAuthorizationCredentials
+
+region = "eu-west-1"
+user_pool_id = ""  # needed
+cognito_url = f"https://cognito-idp.{region}.amazonaws.com/{user_pool_id}/.well-known/jwks.json"
+
+# Fetch the JWKS using the updated URL
+jwks = JWKS.parse_obj(requests.get(cognito_url).json())
+print(jwks)
+
+auth = JWTBearer(jwks)
+
+# Set the user credentials
+username = ""  # needed
+password = ""  # needed
+
+# Create the authentication payload
+payload = {
+    "username": username,
+    "password": password
+}
+
+# Set the Cognito token endpoint URL
+token_endpoint = f"https://your-cognito-domain.auth.{region}.amazoncognito.com/oauth2/token"
+
+# Set the client credentials
+client_id = ""  # needed
+client_secret = ""
+
+
+def authenticate_and_get_token(username: str, password: str,
+                               user_pool_id: str, app_client_id: str) -> None:
+    client = boto3.client('cognito-idp')
+
+    resp = client.admin_initiate_auth(
+        UserPoolId=user_pool_id,
+        ClientId=app_client_id,
+        AuthFlow='ADMIN_NO_SRP_AUTH',
+        AuthParameters={
+            "USERNAME": username,
+            "PASSWORD": password
+        }
+    )
+
+    print("Log in success")
+    print("Access token:", resp['AuthenticationResult']['AccessToken'])
+    print("ID token:", resp['AuthenticationResult']['IdToken'])
+
+
+authenticate_and_get_token(username, password, user_pool_id, client_id)
\ No newline at end of file
diff --git a/level_2/auth/cognito/JWTBearer.py b/level_2/auth/cognito/JWTBearer.py
new file mode 100644
index 000000000..aa5f6a81d
--- /dev/null
+++ b/level_2/auth/cognito/JWTBearer.py
@@ -0,0 +1,72 @@
+from typing import Dict, Optional, List
+
+from fastapi import HTTPException
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from jose import jwt, jwk, JWTError
+from jose.utils import base64url_decode
+from pydantic import BaseModel
+from starlette.requests import Request
+from starlette.status import HTTP_403_FORBIDDEN
+
+JWK = Dict[str, str]
+
+
+class JWKS(BaseModel):
+    keys: List[JWK]
+
+
+class JWTAuthorizationCredentials(BaseModel):
+    jwt_token: str
+    header: Dict[str, str]
+    claims: Dict[str, str]
+    signature: str
+    message: str
+
+
+class JWTBearer(HTTPBearer):
+    def __init__(self, jwks: JWKS, auto_error: bool = True):
+        super().__init__(auto_error=auto_error)
+
+        self.kid_to_jwk = {jwk["kid"]: jwk for jwk in jwks.keys}
+
+    def verify_jwk_token(self, jwt_credentials: 
JWTAuthorizationCredentials) -> bool: + try: + public_key = self.kid_to_jwk[jwt_credentials.header["kid"]] + except KeyError: + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, detail="JWK public key not found" + ) + + key = jwk.construct(public_key) + decoded_signature = base64url_decode(jwt_credentials.signature.encode()) + + return key.verify(jwt_credentials.message.encode(), decoded_signature) + + async def __call__(self, request: Request) -> Optional[JWTAuthorizationCredentials]: + credentials: HTTPAuthorizationCredentials = await super().__call__(request) + + if credentials: + if not credentials.scheme == "Bearer": + raise HTTPException( + status_code=HTTP_403_FORBIDDEN, detail="Wrong authentication method" + ) + + jwt_token = credentials.credentials + + message, signature = jwt_token.rsplit(".", 1) + + try: + jwt_credentials = JWTAuthorizationCredentials( + jwt_token=jwt_token, + header=jwt.get_unverified_header(jwt_token), + claims=jwt.get_unverified_claims(jwt_token), + signature=signature, + message=message, + ) + except JWTError: + raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="JWK invalid") + + if not self.verify_jwk_token(jwt_credentials): + raise HTTPException(status_code=HTTP_403_FORBIDDEN, detail="JWK invalid") + + return jwt_credentials \ No newline at end of file diff --git a/level_2/level_2_pdf_vectorstore__dlt_contracts.py b/level_2/level_2_pdf_vectorstore__dlt_contracts.py index 42892c2bb..645de2b2f 100644 --- a/level_2/level_2_pdf_vectorstore__dlt_contracts.py +++ b/level_2/level_2_pdf_vectorstore__dlt_contracts.py @@ -15,11 +15,12 @@ from langchain.retrievers import WeaviateHybridSearchRetriever from langchain.tools import tool from marvin import ai_classifier from pydantic import parse_obj_as - +from weaviate.gql.get import HybridFusion +import numpy as np load_dotenv() from langchain import OpenAI from langchain.chat_models import ChatOpenAI -from typing import Optional +from typing import Optional, Dict, List, Union import tracemalloc @@ -77,6 +78,35 @@ ST_MEMORY_ID_DEFAULT = "0000" BUFFER_ID_DEFAULT = "0000" +class DifferentiableLayer: + def __init__(self, attention_modulators: dict): + self.weights = {modulator: 1.0 for modulator in attention_modulators} + self.learning_rate = 0.1 + self.regularization_lambda = 0.01 + self.weight_decay = 0.99 + + async def adjust_weights(self, feedbacks: list[float]): + """ + Adjusts the weights of the attention modulators based on user feedbacks. + + Parameters: + - feedbacks: A list of feedback scores (between 0 and 1). 
+        """
+        avg_feedback = np.mean(feedbacks)
+        feedback_diff = 1.0 - avg_feedback
+
+        # Adjust weights based on average feedback
+        for modulator in self.weights:
+            self.weights[modulator] += self.learning_rate * (-feedback_diff) - self.regularization_lambda * \
+                                       self.weights[modulator]
+            self.weights[modulator] *= self.weight_decay
+
+        # Decaying the learning rate
+        self.learning_rate *= 0.99
+
+    async def get_weights(self):
+        return self.weights
+
 class VectorDBFactory:
     def create_vector_db(
         self,
@@ -352,6 +382,7 @@ class WeaviateVectorDB(VectorDB):
             )
             .with_hybrid(
                 query=observation,
+                fusion_type=HybridFusion.RELATIVE_SCORE
             )
             .with_autocut(1)
             .with_where(params_user_id)
@@ -452,7 +483,8 @@ class BaseMemory:
     ):
         if self.db_type == "weaviate":
             return await self.vector_db.add_memories(
-                observation=observation, loader_settings=loader_settings, params=params, namespace=namespace
+                observation=observation, loader_settings=loader_settings,
+                params=params, namespace=namespace
             )
         # Add other db_type conditions if necessary
@@ -464,7 +496,8 @@ class BaseMemory:
     ):
         if self.db_type == "weaviate":
             return await self.vector_db.fetch_memories(
-                observation=observation, params=params, namespace=namespace
+                observation=observation, params=params,
+                namespace=namespace
             )

     async def delete_memories(self, params: Optional[str] = None):
@@ -483,8 +516,7 @@ class SemanticMemory(BaseMemory):
         db_type: str = "weaviate",
     ):
         super().__init__(
-            user_id, memory_id, index_name, db_type, namespace="SEMANTICMEMORY"
-        )
+            user_id, memory_id, index_name, db_type, namespace="SEMANTICMEMORY")


 class EpisodicMemory(BaseMemory):
@@ -580,12 +612,66 @@ class EpisodicBuffer(BaseMemory):
         )
         return [str(frequency), result_output["data"]["Get"]["EPISODICMEMORY"][0]]

-    async def relevance(self, observation: str, namespace: str) -> list[str]:
-        """Relevance - Score between 0 and 1 on how often was the final information relevant to the user in the past.
-        Stored in the episodic memory, mainly to show how well a buffer did the job
-        Starts at 0, gets updated based on the user feedback"""
-        return ["0", "memory"]
+    async def repetition(self, observation: str, namespace: str) -> list[str]:
+        """Repetition - Score between 0 and 1 based on how often and at what intervals a memory has been revisited.
+        Accounts for the spacing effect, where memories accessed at increasing intervals are given higher scores.
+        """
+        result_output = await self.fetch_memories(
+            observation=observation, params=None, namespace=namespace
+        )
+
+        access_times = result_output["data"]["Get"]["EPISODICMEMORY"][0]["_additional"]["accessTimes"]
+        # Calculate repetition score based on access times
+        if not access_times or len(access_times) == 1:
+            return ["0", result_output["data"]["Get"]["EPISODICMEMORY"][0]]
+
+        # Sort access times
+        access_times = sorted(access_times)
+        # Calculate intervals between consecutive accesses
+        intervals = [access_times[i + 1] - access_times[i] for i in range(len(access_times) - 1)]
+        # Spaced repetition: longer gaps between accesses score higher, so each
+        # interval maps to interval / (interval + 1), which approaches 1 as the
+        # gap grows, and the per-interval scores are averaged
+        repetition_score = sum(interval / (interval + 1.0) for interval in intervals) / len(intervals)
+
+        return [str(repetition_score), result_output["data"]["Get"]["EPISODICMEMORY"][0]]
+
+    async def relevance(self, observation: str, namespace: str) -> list[str]:
+        """
+        Fetches the relevance score for a given observation from the episodic memory.
+
+        Parameters:
+        - observation: The user's query or observation.
+        - namespace: The namespace for the data.
+
+        Returns:
+        - The relevance score between 0 and 1.
+        """
+
+        # Fetch the memory content based on the observation
+        result_output = await self.fetch_memories(
+            observation=observation, params=None, namespace=namespace
+        )
+
+        # Extract the relevance score from the memory content
+        score = result_output["data"]["Get"]["EPISODICMEMORY"][0]["_additional"]["score"]
+
+        return score
+
+    # Each request is numbered, and the previous requests are retrieved.
+    # Based on its past and current content, a request is classified as:
+    #   1. Very positive request
+    #   2. Positive request
+    #   3. Neutral request
+    #   4. Negative request
+    #   5. Very negative request
+    #
+    # The modulator weights are then updated according to that classification,
+    # and the buffer is updated with the new weights; the next adjustment
+    # starts from the updated values.
+    # Open question: which chunking strategy works best?
+    #
+    # Adding to the buffer: process the weights, then use them as filters.

     async def saliency(self, observation: str, namespace=None) -> list[str]:
         """Determines saliency by scoring the set of retrieved documents against each other and trying to determine saliency
@@ -617,6 +703,19 @@



+    # Example usage (note that adjust_weights and get_weights are coroutines):
+    # attention_modulators = {"freshness": 0.8, "frequency": 0.7, "relevance": 0.9, "saliency": 0.85}
+    # diff_layer = DifferentiableLayer(attention_modulators)
+    #
+    # # Sample batch feedback
+    # feedbacks = [0.75, 0.8, 0.9]
+    #
+    # # Adjust weights based on batch feedback
+    # await diff_layer.adjust_weights(feedbacks)
+    #
+    # print(await diff_layer.get_weights())
+
     async def handle_modulator(
         self,
         modulator_name: str,
@@ -679,12 +778,6 @@
         attention_modulators: dict = None,
     ):
         """Generates the context to be used for the buffer and passed to the agent"""
-        try:
-            # we delete all memories in the episodic buffer, so we can start fresh
-            await self.delete_memories()
-        except:
-            # in case there are no memories, we pass
-            pass
         # we just filter the data here to make sure input is clean
         prompt_filter = ChatPromptTemplate.from_template(
             """Filter and remove unnecessary information that is not relevant in the query to
@@ -695,40 +788,106 @@

         # this part is partially done but the idea is to apply different attention modulators
         # to the data to fetch the most relevant information from the vector stores
-        context = []
-        if attention_modulators:
-            from typing import Optional, Dict, List, Union
+        class BufferModulators(BaseModel):
+            """Value of buffer modulators"""
+            frequency: str = Field(..., description="Frequency score of the document")
+            saliency: str = Field(..., description="Saliency score of the document")
+            relevance: str = Field(..., description="Relevance score of the document")
+            description: str = Field(..., description="Latest buffer modulators")
+            direction: str = Field(..., description="Increase or a decrease of the modulator")

-            lookup_value_semantic = await self.fetch_memories(
-                observation=str(output), namespace="SEMANTICMEMORY"
-            )
-            context = []
-            for memory in lookup_value_semantic["data"]["Get"]["SEMANTICMEMORY"]:
-                # extract memory id, and pass it to fetch function as a parameter
-                modulators = list(attention_modulators.keys())
-                for modulator in modulators:
-                    result = await self.handle_modulator(
-                        modulator,
-                        attention_modulators,
-                        str(output),
-                        namespace="EPISODICMEMORY",
+        parser = PydanticOutputParser(pydantic_object=BufferModulators)
+
+        prompt = PromptTemplate(
+            template="""Structure the buffer modulators to be used for the buffer. \n
+            {format_instructions} \nOriginal observation is:
+            {query}\n """,
+            input_variables=["query"],
+            partial_variables={"format_instructions": parser.get_format_instructions()},
+        )
+
+        # check if modulators exist, initialize the modulators if needed
+        if attention_modulators is None:
+            try:
+                attention_modulators = await self.fetch_memories(observation="Attention modulators",
+                                                                 namespace="BUFFERMEMORY")
+                lookup_value_episodic = await self.fetch_memories(
+                    observation=str(output), namespace="EPISODICMEMORY"
+                )
+                prompt_classify = ChatPromptTemplate.from_template(
+                    """You are a classifier. Determine, based on the previous query, if the user was satisfied with the output: {query}"""
+                )
+                # OpenAI function definitions carry a JSON Schema under
+                # 'parameters', and 'functions' expects a list of definitions
+                json_structure = [{
+                    "name": "classifier",
+                    "description": "Classification indicating if the output is satisfactory",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "classifier": {"type": "boolean"}
+                        },
+                        "required": ["classifier"],
+                    },
+                }]
+                chain_filter = prompt_classify | self.llm.bind(function_call={"name": "classifier"}, functions=json_structure)
+                classifier_output = await chain_filter.ainvoke({"query": lookup_value_episodic})
+                arguments_str = classifier_output.additional_kwargs['function_call']['arguments']
+                arguments_dict = json.loads(arguments_str)
+                classifier_value = arguments_dict.get('classifier', None)
+
+                if classifier_value:
+                    # adjust the weights of the modulators by adding a positive value
+                    prompt_classify = ChatPromptTemplate.from_template(
+                        """We know we need to increase the classifiers for our AI system. The classifiers are {modulators}. The query is: {query}. Which of the classifiers should we increase? Return just the modulator and desired value"""
+                    )
-                if result:
-                    context.append(result)
-                    context.append(memory)
-        else:
-            # defaults to semantic search if we don't want to apply algorithms on the vectordb data
-            lookup_value_episodic = await self.fetch_memories(
-                observation=str(output), namespace="EPISODICMEMORY"
-            )
-            lookup_value_semantic = await self.fetch_memories(
-                observation=str(output), namespace="SEMANTICMEMORY"
-            )
-            lookup_value_buffer = await self.fetch_memories(observation=str(output))
+                    chain_modulator = prompt_classify | self.llm
+                    classifier_output = await chain_modulator.ainvoke({"query": lookup_value_episodic, "modulators": str(attention_modulators)})
+                    diff_layer = DifferentiableLayer(attention_modulators)
+                    # adjust_weights is a coroutine and expects numeric feedback
+                    # scores, so the positive classification maps to 1.0 here
+                    await diff_layer.adjust_weights([1.0])
+                    adjusted_modulator = await diff_layer.get_weights()
+                    _input = prompt.format_prompt(query=adjusted_modulator)
+                    document_context_result = self.llm_base(_input.to_string())
+                    document_context_result_parsed = parser.parse(document_context_result)
+                    await self.add_memories(observation=document_context_result_parsed, namespace="BUFFERMEMORY")
+                else:
+                    # adjust the weights of the modulators by adding a negative value
+                    prompt_classify = ChatPromptTemplate.from_template(
+                        """We know we need to decrease the classifiers for our AI system. The classifiers are {modulators}. The query is: {query}. Which of the classifiers should we decrease? Return just the modulator and desired value"""
+                    )
+                    chain_modulator_reduction = prompt_classify | self.llm
+
+                    classifier_output = await chain_modulator_reduction.ainvoke({"query": lookup_value_episodic, "modulators": str(attention_modulators)})
+                    diff_layer = DifferentiableLayer(attention_modulators)
+                    # mirror of the positive branch: the negative classification
+                    # maps to a feedback score of 0.0
+                    await diff_layer.adjust_weights([0.0])
+                    adjusted_modulator = await diff_layer.get_weights()
+                    _input = prompt.format_prompt(query=adjusted_modulator)
+                    document_context_result = self.llm_base(_input.to_string())
+                    document_context_result_parsed = parser.parse(document_context_result)
+                    await self.add_memories(observation=document_context_result_parsed, namespace="BUFFERMEMORY")
+            except Exception:
+                # initialize the modulators with default values if they are not provided
+                print("Starting with default modulators")
+                attention_modulators = {
+                    "freshness": 0.5,
+                    "frequency": 0.5,
+                    "relevance": 0.5,
+                    "saliency": 0.5,
+                }
+
+        lookup_value_semantic = await self.fetch_memories(
+            observation=str(output), namespace="SEMANTICMEMORY"
+        )
+        context = []
+        for memory in lookup_value_semantic["data"]["Get"]["SEMANTICMEMORY"]:
+            # extract memory id, and pass it to fetch function as a parameter
+            modulators = list(attention_modulators.keys())
+            for modulator in modulators:
+                result = await self.handle_modulator(
+                    modulator,
+                    attention_modulators,
+                    str(output),
+                    namespace="EPISODICMEMORY",
+                )
+                if result:
+                    context.append(result)
+            context.append(memory)
-            context.append(lookup_value_buffer)
-            context.append(lookup_value_semantic)
-            context.append(lookup_value_episodic)

 class BufferModulators(BaseModel):
     frequency: str = Field(..., description="Frequency score of the document")
@@ -939,12 +1098,7 @@
             result_tasks.append(task)
             result_tasks.append(output)

-        # print("HERE IS THE RESULT TASKS", str(result_tasks))
-        # # buffer_result = await self.fetch_memories(observation=str(user_input))
-        #
-        # print("HERE IS THE RESULT TASKS", str(buffer_result))

         class EpisodicTask(BaseModel):
             """Schema for an individual task."""
@@ -972,18 +1126,19 @@
             user_query: str = Field(
                 ..., description="The order at which the task needs to be performed"
             )
-
+            attention_modulators: str = Field(..., description="List of attention modulators")
         parser = PydanticOutputParser(pydantic_object=EpisodicList)
+        date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         prompt = PromptTemplate(
-            template="Format the result.\n{format_instructions}\nOriginal query is: {query}\n Steps are: {steps}, buffer is: {buffer}",
-            input_variables=["query", "steps", "buffer"],
+            template="Format the result.\n{format_instructions}\nOriginal query is: {query}\n Steps are: {steps}, buffer is: {buffer}, date is: {date}, attention modulators are: {attention_modulators} \n",
+            input_variables=["query", "steps", "buffer", "date", "attention_modulators"],
             partial_variables={"format_instructions": parser.get_format_instructions()},
         )
         _input = prompt.format_prompt(
-            query=user_input, steps=str(tasks_list)
-            , buffer=str(result_tasks)
+            query=user_input, steps=str(tasks_list),
+            buffer=str(result_tasks), date=date, attention_modulators=attention_modulators
         )

         # return "a few things to do like load episodic memory in a structured format"
@@ -992,8 +1147,7 @@
         lookup_value = await self.add_memories(
             observation=str(result_parsing.json()), params=params, namespace='EPISODICMEMORY'
         )
-        # print("THE RESULT OF THIS QUERY IS ", result_parsing.json())
-        await self.delete_memories()
+        # keep the buffer contents so they stay available for feedback scoring:
+        # await self.delete_memories()

         return result_parsing.json()


@@ -1188,6 +1342,9 @@ class Memory:
     async def _available_operations(self):
         return await self.long_term_memory.episodic_buffer.available_operations()

+    async def _provide_feedback(self, user_input: str = None, params: dict = None,
+                                attention_modulators: dict = None, total_score: str = None):
+        # keyword names mirror the /buffer/provide-feedback endpoint payload
+        return await self.short_term_memory.episodic_buffer.provide_feedback(
+            user_input=user_input, params=params,
+            attention_modulators=attention_modulators, total_score=total_score)
+

 async def main():
@@ -1212,8 +1369,8 @@ async def main():
         "source": "url",
         "path": "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
     }
-    load_jack_london = await memory._add_semantic_memory(observation = "bla", loader_settings=loader_settings, params=params)
-    print(load_jack_london)
+    # load_jack_london = await memory._add_semantic_memory(observation = "bla", loader_settings=loader_settings, params=params)
+    # print(load_jack_london)

     modulator = {"relevance": 0.0, "saliency": 0.0, "frequency": 0.0}
     #
     #
diff --git a/level_2/poetry.lock b/level_2/poetry.lock
index 8233ad038..ba0c3e970 100644
--- a/level_2/poetry.lock
+++ b/level_2/poetry.lock
@@ -261,17 +261,17 @@ numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""}

 [[package]]
 name = "boto3"
-version = "1.28.32"
+version = "1.28.37"
 description = "The AWS SDK for Python"
 optional = false
 python-versions = ">= 3.7"
 files = [
-    {file = "boto3-1.28.32-py3-none-any.whl", hash = "sha256:ed787f250ce2562c7744395bdf32b5a7bc9184126ef50a75e97bcb66043dccf3"},
-    {file = "boto3-1.28.32.tar.gz", hash = "sha256:b505faa126db84e226f6f8d242a798fae30a725f0cac8a76c6aca9ace4e8eb28"},
+    {file = "boto3-1.28.37-py3-none-any.whl", hash = "sha256:709cf438ad3ea48d426e4659538fe1148fc2719469b52179d07a11c5d26abac6"},
+    {file = "boto3-1.28.37.tar.gz", hash = "sha256:4aec1b54ba6cd352abba2cdd7cdc76e631a4d3ce79c55c0719f85f9c9842e4a2"},
 ]

 [package.dependencies]
-botocore = ">=1.31.32,<1.32.0"
+botocore = ">=1.31.37,<1.32.0"
 jmespath = ">=0.7.1,<2.0.0"
 s3transfer = ">=0.6.0,<0.7.0"

 [package.extras]
 crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]

 [[package]]
 name = "botocore"
-version = "1.31.32"
+version = "1.31.37"
 description = "Low-level, data-driven core of boto 3."
optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.31.32-py3-none-any.whl", hash = "sha256:8992ac186988c4b4cc168e8e479e9472da1442b193c1bf7c9dcd1877ec62d23c"}, - {file = "botocore-1.31.32.tar.gz", hash = "sha256:7a07d8dc8cc47bf23af39409ada81f388eb78233e1bb2cde0c415756da753664"}, + {file = "botocore-1.31.37-py3-none-any.whl", hash = "sha256:72e10759be3dff39c5eeb29f85c11a227c369c946d044f2caf62c352d6a6fc06"}, + {file = "botocore-1.31.37.tar.gz", hash = "sha256:5c92c8bc3c6b49950c95501b30f0ac551fd4952359b53a6fba243094028157de"}, ] [package.dependencies] @@ -662,6 +662,32 @@ ai = ["openai (>=0.27.6,<0.28.0)"] docx = ["docx2txt (>=0.8,<0.9)"] pdf = ["pypdf (>=3.3.0,<4.0.0)"] +[[package]] +name = "deepeval" +version = "0.10.12" +description = "Deep eval provides evaluation platform to accelerate development of LLMs and Agents" +optional = false +python-versions = "*" +files = [ + {file = "deepeval-0.10.12-py3-none-any.whl", hash = "sha256:239eb720e8a205afab1ae2425e483177bd76cde658bdac98658a6559bdba4f3f"}, + {file = "deepeval-0.10.12.tar.gz", hash = "sha256:80968d57a9da6c4fce6247d31ebf7fea228c76393e0d985804be68b722090732"}, +] + +[package.dependencies] +protobuf = "<=3.20.5" +pytest = "*" +requests = "*" +rich = "*" +sentence-transformers = "*" +tabulate = "*" +tqdm = "*" +transformers = "*" +typer = "*" + +[package.extras] +bias = ["Dbias", "tensorflow"] +toxic = ["detoxify"] + [[package]] name = "deprecated" version = "1.2.14" @@ -681,13 +707,13 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] [[package]] name = "dlt" -version = "0.3.12" +version = "0.3.14" description = "DLT is an open-source python-native scalable data loading framework that does not require any devops efforts to run." optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "dlt-0.3.12-py3-none-any.whl", hash = "sha256:f9695a7fb98f5802e34f3f27d64e6ec1c0ccede48f26a1fdcd2db2c9280de9ac"}, - {file = "dlt-0.3.12.tar.gz", hash = "sha256:1fb4a947b2215627c1ee5725f2af80c6c7fcab60e7c3a57d49ab84b495444117"}, + {file = "dlt-0.3.14-py3-none-any.whl", hash = "sha256:b7672e153065796d0e7b0bc7eacfc48feff32a28e091eeca30f5a7180e42da2c"}, + {file = "dlt-0.3.14.tar.gz", hash = "sha256:b398ee07a1b87a6ac93130fc8e143d77e99a30d1bf957468d0252f23f563c01e"}, ] [package.dependencies] @@ -732,6 +758,7 @@ gs = ["gcsfs (>=2022.4.0)"] motherduck = ["duckdb (>=0.6.1,<0.9.0)", "pyarrow (>=8.0.0)"] parquet = ["pyarrow (>=8.0.0)"] postgres = ["psycopg2-binary (>=2.9.1)", "psycopg2cffi (>=2.9.0)"] +pydantic = ["pydantic (>=1.10,<2.0)"] redshift = ["psycopg2-binary (>=2.9.1)", "psycopg2cffi (>=2.9.0)"] s3 = ["boto3 (>=1.25)", "s3fs (>=2022.4.0)"] snowflake = ["snowflake-connector-python[pandas] (>=2.9.0)"] @@ -881,6 +908,24 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "filelock" +version = "3.12.3" +description = "A platform independent file lock." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, + {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"] + [[package]] name = "frozenlist" version = "1.4.0" @@ -1232,6 +1277,38 @@ cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] +[[package]] +name = "huggingface-hub" +version = "0.16.4" +description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "huggingface_hub-0.16.4-py3-none-any.whl", hash = "sha256:0d3df29932f334fead024afc7cb4cc5149d955238b8b5e42dcf9740d6995a349"}, + {file = "huggingface_hub-0.16.4.tar.gz", hash = "sha256:608c7d4f3d368b326d1747f91523dbd1f692871e8e2e7a4750314a2dd8b63e14"}, +] + +[package.dependencies] +filelock = "*" +fsspec = "*" +packaging = ">=20.9" +pyyaml = ">=5.1" +requests = "*" +tqdm = ">=4.42.1" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +cli = ["InquirerPy (==0.3.4)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] +inference = ["aiohttp", "pydantic"] +quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"] +tensorflow = ["graphviz", "pydot", "tensorflow"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +torch = ["torch"] +typing = ["pydantic", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] + [[package]] name = "humanize" version = "4.8.0" @@ -1414,13 +1491,13 @@ data = ["language-data (>=1.1,<2.0)"] [[package]] name = "langsmith" -version = "0.0.26" +version = "0.0.28" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langsmith-0.0.26-py3-none-any.whl", hash = "sha256:61c1d4582104d96edde04e1eea1dae347645b691c44489a5871341a2a1a2a1eb"}, - {file = "langsmith-0.0.26.tar.gz", hash = "sha256:80a4ef1b663a24a460d25b9986ab2010c5d06b6061c65be473abafc0647d191a"}, + {file = "langsmith-0.0.28-py3-none-any.whl", hash = "sha256:f398782f41526c74e141e68fa28b9020e0be4bde18a1d4a76b357c8272fb81bd"}, + {file = "langsmith-0.0.28.tar.gz", hash = "sha256:34c15f9a8908be180001c58048b659ece6320d0bf8ffce4ca496a2428b35646e"}, ] [package.dependencies] @@ -1620,6 +1697,23 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1751,6 +1845,24 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "nltk" version = "3.8.1" @@ -2053,6 +2165,75 @@ files = [ [package.dependencies] ptyprocess = ">=0.5" +[[package]] +name = "pillow" +version = "10.0.0" +description = "Python Imaging Library (Fork)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "Pillow-10.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891"}, + {file = "Pillow-10.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d5db32e2a6ccbb3d34d87c87b432959e0db29755727afb37290e10f6e8e62614"}, + {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edf4392b77bdc81f36e92d3a07a5cd072f90253197f4a52a55a8cec48a12483b"}, + {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:520f2a520dc040512699f20fa1c363eed506e94248d71f85412b625026f6142c"}, + {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:8c11160913e3dd06c8ffdb5f233a4f254cb449f4dfc0f8f4549eda9e542c93d1"}, + {file = 
"Pillow-10.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a74ba0c356aaa3bb8e3eb79606a87669e7ec6444be352870623025d75a14a2bf"}, + {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d5d0dae4cfd56969d23d94dc8e89fb6a217be461c69090768227beb8ed28c0a3"}, + {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22c10cc517668d44b211717fd9775799ccec4124b9a7f7b3635fc5386e584992"}, + {file = "Pillow-10.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dffe31a7f47b603318c609f378ebcd57f1554a3a6a8effbc59c3c69f804296de"}, + {file = "Pillow-10.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:9fb218c8a12e51d7ead2a7c9e101a04982237d4855716af2e9499306728fb485"}, + {file = "Pillow-10.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d35e3c8d9b1268cbf5d3670285feb3528f6680420eafe35cccc686b73c1e330f"}, + {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ed64f9ca2f0a95411e88a4efbd7a29e5ce2cea36072c53dd9d26d9c76f753b3"}, + {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b6eb5502f45a60a3f411c63187db83a3d3107887ad0d036c13ce836f8a36f1d"}, + {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c1fbe7621c167ecaa38ad29643d77a9ce7311583761abf7836e1510c580bf3dd"}, + {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cd25d2a9d2b36fcb318882481367956d2cf91329f6892fe5d385c346c0649629"}, + {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3b08d4cc24f471b2c8ca24ec060abf4bebc6b144cb89cba638c720546b1cf538"}, + {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d737a602fbd82afd892ca746392401b634e278cb65d55c4b7a8f48e9ef8d008d"}, + {file = "Pillow-10.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3a82c40d706d9aa9734289740ce26460a11aeec2d9c79b7af87bb35f0073c12f"}, + {file = "Pillow-10.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:bc2ec7c7b5d66b8ec9ce9f720dbb5fa4bace0f545acd34870eff4a369b44bf37"}, + {file = "Pillow-10.0.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:d80cf684b541685fccdd84c485b31ce73fc5c9b5d7523bf1394ce134a60c6883"}, + {file = "Pillow-10.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76de421f9c326da8f43d690110f0e79fe3ad1e54be811545d7d91898b4c8493e"}, + {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81ff539a12457809666fef6624684c008e00ff6bf455b4b89fd00a140eecd640"}, + {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce543ed15570eedbb85df19b0a1a7314a9c8141a36ce089c0a894adbfccb4568"}, + {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:685ac03cc4ed5ebc15ad5c23bc555d68a87777586d970c2c3e216619a5476223"}, + {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d72e2ecc68a942e8cf9739619b7f408cc7b272b279b56b2c83c6123fcfa5cdff"}, + {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d50b6aec14bc737742ca96e85d6d0a5f9bfbded018264b3b70ff9d8c33485551"}, + {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:00e65f5e822decd501e374b0650146063fbb30a7264b4d2744bdd7b913e0cab5"}, + {file = "Pillow-10.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:f31f9fdbfecb042d046f9d91270a0ba28368a723302786c0009ee9b9f1f60199"}, + {file = "Pillow-10.0.0-cp312-cp312-win_arm64.whl", hash = 
"sha256:1ce91b6ec08d866b14413d3f0bbdea7e24dfdc8e59f562bb77bc3fe60b6144ca"}, + {file = "Pillow-10.0.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:349930d6e9c685c089284b013478d6f76e3a534e36ddfa912cde493f235372f3"}, + {file = "Pillow-10.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3a684105f7c32488f7153905a4e3015a3b6c7182e106fe3c37fbb5ef3e6994c3"}, + {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4f69b3700201b80bb82c3a97d5e9254084f6dd5fb5b16fc1a7b974260f89f43"}, + {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f07ea8d2f827d7d2a49ecf1639ec02d75ffd1b88dcc5b3a61bbb37a8759ad8d"}, + {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:040586f7d37b34547153fa383f7f9aed68b738992380ac911447bb78f2abe530"}, + {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f88a0b92277de8e3ca715a0d79d68dc82807457dae3ab8699c758f07c20b3c51"}, + {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c7cf14a27b0d6adfaebb3ae4153f1e516df54e47e42dcc073d7b3d76111a8d86"}, + {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3400aae60685b06bb96f99a21e1ada7bc7a413d5f49bce739828ecd9391bb8f7"}, + {file = "Pillow-10.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbc02381779d412145331789b40cc7b11fdf449e5d94f6bc0b080db0a56ea3f0"}, + {file = "Pillow-10.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:9211e7ad69d7c9401cfc0e23d49b69ca65ddd898976d660a2fa5904e3d7a9baa"}, + {file = "Pillow-10.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:faaf07ea35355b01a35cb442dd950d8f1bb5b040a7787791a535de13db15ed90"}, + {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9f72a021fbb792ce98306ffb0c348b3c9cb967dce0f12a49aa4c3d3fdefa967"}, + {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f7c16705f44e0504a3a2a14197c1f0b32a95731d251777dcb060aa83022cb2d"}, + {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:76edb0a1fa2b4745fb0c99fb9fb98f8b180a1bbceb8be49b087e0b21867e77d3"}, + {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:368ab3dfb5f49e312231b6f27b8820c823652b7cd29cfbd34090565a015e99ba"}, + {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:608bfdee0d57cf297d32bcbb3c728dc1da0907519d1784962c5f0c68bb93e5a3"}, + {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5c6e3df6bdd396749bafd45314871b3d0af81ff935b2d188385e970052091017"}, + {file = "Pillow-10.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:7be600823e4c8631b74e4a0d38384c73f680e6105a7d3c6824fcf226c178c7e6"}, + {file = "Pillow-10.0.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:92be919bbc9f7d09f7ae343c38f5bb21c973d2576c1d45600fce4b74bafa7ac0"}, + {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8182b523b2289f7c415f589118228d30ac8c355baa2f3194ced084dac2dbba"}, + {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:38250a349b6b390ee6047a62c086d3817ac69022c127f8a5dc058c31ccef17f3"}, + {file = "Pillow-10.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:88af2003543cc40c80f6fca01411892ec52b11021b3dc22ec3bc9d5afd1c5334"}, + {file = "Pillow-10.0.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c189af0545965fa8d3b9613cfdb0cd37f9d71349e0f7750e1fd704648d475ed2"}, + {file = 
"Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce7b031a6fc11365970e6a5686d7ba8c63e4c1cf1ea143811acbb524295eabed"}, + {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:db24668940f82321e746773a4bc617bfac06ec831e5c88b643f91f122a785684"}, + {file = "Pillow-10.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:efe8c0681042536e0d06c11f48cebe759707c9e9abf880ee213541c5b46c5bf3"}, + {file = "Pillow-10.0.0.tar.gz", hash = "sha256:9c82b5b3e043c7af0d95792d0d20ccf68f61a1fec6b3530e718b688422727396"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pinecone-client" version = "2.2.2" @@ -2100,13 +2281,13 @@ pyee = "9.0.4" [[package]] name = "pluggy" -version = "1.2.0" +version = "1.3.0" description = "plugin and hook calling mechanisms for python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, - {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, + {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, + {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, ] [package.extras] @@ -2165,6 +2346,37 @@ files = [ cymem = ">=2.0.2,<2.1.0" murmurhash = ">=0.28.0,<1.1.0" +[[package]] +name = "protobuf" +version = "3.20.3" +description = "Protocol Buffers" +optional = false +python-versions = ">=3.7" +files = [ + {file = "protobuf-3.20.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99"}, + {file = "protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e"}, + {file = "protobuf-3.20.3-cp310-cp310-win32.whl", hash = "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c"}, + {file = "protobuf-3.20.3-cp310-cp310-win_amd64.whl", hash = "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7"}, + {file = "protobuf-3.20.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469"}, + {file = "protobuf-3.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4"}, + {file = "protobuf-3.20.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4"}, + {file = "protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454"}, + {file = "protobuf-3.20.3-cp37-cp37m-win32.whl", hash = "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905"}, + {file = "protobuf-3.20.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c"}, + {file = "protobuf-3.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7"}, + 
{file = "protobuf-3.20.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee"}, + {file = "protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050"}, + {file = "protobuf-3.20.3-cp38-cp38-win32.whl", hash = "sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86"}, + {file = "protobuf-3.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9"}, + {file = "protobuf-3.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b"}, + {file = "protobuf-3.20.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b"}, + {file = "protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402"}, + {file = "protobuf-3.20.3-cp39-cp39-win32.whl", hash = "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480"}, + {file = "protobuf-3.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7"}, + {file = "protobuf-3.20.3-py2.py3-none-any.whl", hash = "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db"}, + {file = "protobuf-3.20.3.tar.gz", hash = "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2"}, +] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -2281,13 +2493,13 @@ plugins = ["importlib-metadata"] [[package]] name = "pypdf" -version = "3.15.2" +version = "3.15.4" description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" optional = false python-versions = ">=3.6" files = [ - {file = "pypdf-3.15.2-py3-none-any.whl", hash = "sha256:f6e598292be34187287a609c72815c1502b3dc2c997b374ba0870ce79d2e975a"}, - {file = "pypdf-3.15.2.tar.gz", hash = "sha256:cdf7d75ebb8901f3352cf9488c5f662c6de9c52e432c429d15cada67ba372fce"}, + {file = "pypdf-3.15.4-py3-none-any.whl", hash = "sha256:791f0a52ddf390709f1f1b0c05c4d8cde13829b4f7cb91b4003b9bdd352bc944"}, + {file = "pypdf-3.15.4.tar.gz", hash = "sha256:a2780ed01dc4da23ac1542209f58fd3d951d8dd37c3c0309d123cd2f2679fb03"}, ] [package.extras] @@ -2732,6 +2944,165 @@ botocore = ">=1.12.36,<2.0a.0" [package.extras] crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] +[[package]] +name = "safetensors" +version = "0.3.3" +description = "Fast and Safe Tensor serialization" +optional = false +python-versions = "*" +files = [ + {file = "safetensors-0.3.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:92e4d0c8b2836120fddd134474c5bda8963f322333941f8b9f643e5b24f041eb"}, + {file = "safetensors-0.3.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:3dcadb6153c42addc9c625a622ebde9293fabe1973f9ef31ba10fb42c16e8536"}, + {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:08f26b61e1b0a14dc959aa9d568776bd038805f611caef1de04a80c468d4a7a4"}, + {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:17f41344d9a075f2f21b289a49a62e98baff54b5754240ba896063bce31626bf"}, + {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:f1045f798e1a16a6ced98d6a42ec72936d367a2eec81dc5fade6ed54638cd7d2"}, + {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_x86_64.whl", hash = 
"sha256:eaf0e4bc91da13f21ac846a39429eb3f3b7ed06295a32321fa3eb1a59b5c70f3"}, + {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25149180d4dc8ca48bac2ac3852a9424b466e36336a39659b35b21b2116f96fc"}, + {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9e943bf78c39de8865398a71818315e7d5d1af93c7b30d4da3fc852e62ad9bc"}, + {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cccfcac04a010354e87c7a2fe16a1ff004fc4f6e7ef8efc966ed30122ce00bc7"}, + {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07121f427e646a50d18c1be0fa1a2cbf6398624c31149cd7e6b35486d72189e"}, + {file = "safetensors-0.3.3-cp310-cp310-win32.whl", hash = "sha256:a85e29cbfddfea86453cc0f4889b4bcc6b9c155be9a60e27be479a34e199e7ef"}, + {file = "safetensors-0.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:e13adad4a3e591378f71068d14e92343e626cf698ff805f61cdb946e684a218e"}, + {file = "safetensors-0.3.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:cbc3312f134baf07334dd517341a4b470b2931f090bd9284888acb7dfaf4606f"}, + {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d15030af39d5d30c22bcbc6d180c65405b7ea4c05b7bab14a570eac7d7d43722"}, + {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:f84a74cbe9859b28e3d6d7715ac1dd3097bebf8d772694098f6d42435245860c"}, + {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:10d637423d98ab2e6a4ad96abf4534eb26fcaf8ca3115623e64c00759374e90d"}, + {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:3b46f5de8b44084aff2e480874c550c399c730c84b2e8ad1bddb062c94aa14e9"}, + {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e76da691a82dfaf752854fa6d17c8eba0c8466370c5ad8cf1bfdf832d3c7ee17"}, + {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4e342fd54e66aa9512dd13e410f791e47aa4feeb5f4c9a20882c72f3d272f29"}, + {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:178fd30b5dc73bce14a39187d948cedd0e5698e2f055b7ea16b5a96c9b17438e"}, + {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8fdf7407dba44587ed5e79d5de3533d242648e1f2041760b21474bd5ea5c8c"}, + {file = "safetensors-0.3.3-cp311-cp311-win32.whl", hash = "sha256:7d3b744cee8d7a46ffa68db1a2ff1a1a432488e3f7a5a97856fe69e22139d50c"}, + {file = "safetensors-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f579877d30feec9b6ba409d05fa174633a4fc095675a4a82971d831a8bb60b97"}, + {file = "safetensors-0.3.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:2fff5b19a1b462c17322998b2f4b8bce43c16fe208968174d2f3a1446284ceed"}, + {file = "safetensors-0.3.3-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:41adb1d39e8aad04b16879e3e0cbcb849315999fad73bc992091a01e379cb058"}, + {file = "safetensors-0.3.3-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:0f2b404250b3b877b11d34afcc30d80e7035714a1116a3df56acaca6b6c00096"}, + {file = "safetensors-0.3.3-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:b43956ef20e9f4f2e648818a9e7b3499edd6b753a0f5526d4f6a6826fbee8446"}, + {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d61a99b34169981f088ccfbb2c91170843efc869a0a0532f422db7211bf4f474"}, + {file = 
"safetensors-0.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0008aab36cd20e9a051a68563c6f80d40f238c2611811d7faa5a18bf3fd3984"}, + {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93d54166072b143084fdcd214a080a088050c1bb1651016b55942701b31334e4"}, + {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c32ee08f61cea56a5d62bbf94af95df6040c8ab574afffaeb7b44ae5da1e9e3"}, + {file = "safetensors-0.3.3-cp37-cp37m-win32.whl", hash = "sha256:351600f367badd59f7bfe86d317bb768dd8c59c1561c6fac43cafbd9c1af7827"}, + {file = "safetensors-0.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:034717e297849dae1af0a7027a14b8647bd2e272c24106dced64d83e10d468d1"}, + {file = "safetensors-0.3.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8530399666748634bc0b301a6a5523756931b0c2680d188e743d16304afe917a"}, + {file = "safetensors-0.3.3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:9d741c1f1621e489ba10aa3d135b54202684f6e205df52e219d5eecd673a80c9"}, + {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:0c345fd85b4d2093a5109596ff4cd9dfc2e84992e881b4857fbc4a93a3b89ddb"}, + {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69ccee8d05f55cdf76f7e6c87d2bdfb648c16778ef8acfd2ecc495e273e9233e"}, + {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:c08a9a4b7a4ca389232fa8d097aebc20bbd4f61e477abc7065b5c18b8202dede"}, + {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:a002868d2e3f49bbe81bee2655a411c24fa1f8e68b703dec6629cb989d6ae42e"}, + {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3bd2704cb41faa44d3ec23e8b97330346da0395aec87f8eaf9c9e2c086cdbf13"}, + {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b2951bf3f0ad63df5e6a95263652bd6c194a6eb36fd4f2d29421cd63424c883"}, + {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07114cec116253ca2e7230fdea30acf76828f21614afd596d7b5438a2f719bd8"}, + {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab43aeeb9eadbb6b460df3568a662e6f1911ecc39387f8752afcb6a7d96c087"}, + {file = "safetensors-0.3.3-cp38-cp38-win32.whl", hash = "sha256:f2f59fce31dd3429daca7269a6b06f65e6547a0c248f5116976c3f1e9b73f251"}, + {file = "safetensors-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:c31ca0d8610f57799925bf08616856b39518ab772c65093ef1516762e796fde4"}, + {file = "safetensors-0.3.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:59a596b3225c96d59af412385981f17dd95314e3fffdf359c7e3f5bb97730a19"}, + {file = "safetensors-0.3.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:82a16e92210a6221edd75ab17acdd468dd958ef5023d9c6c1289606cc30d1479"}, + {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:98a929e763a581f516373ef31983ed1257d2d0da912a8e05d5cd12e9e441c93a"}, + {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:12b83f1986cd16ea0454c636c37b11e819d60dd952c26978310a0835133480b7"}, + {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:f439175c827c2f1bbd54df42789c5204a10983a30bc4242bc7deaf854a24f3f0"}, + {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:0085be33b8cbcb13079b3a8e131656e05b0bc5e6970530d4c24150f7afd76d70"}, + {file = 
"safetensors-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3ec70c87b1e910769034206ad5efc051069b105aac1687f6edcd02526767f4"}, + {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f490132383e5e490e710608f4acffcb98ed37f91b885c7217d3f9f10aaff9048"}, + {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79d1b6c7ed5596baf79c80fbce5198c3cdcc521ae6a157699f427aba1a90082d"}, + {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad3cc8006e7a86ee7c88bd2813ec59cd7cc75b03e6fa4af89b9c7b235b438d68"}, + {file = "safetensors-0.3.3-cp39-cp39-win32.whl", hash = "sha256:ab29f54c6b8c301ca05fa014728996bd83aac6e21528f893aaf8945c71f42b6d"}, + {file = "safetensors-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0fa82004eae1a71e2aa29843ef99de9350e459a0fc2f65fc6ee0da9690933d2d"}, + {file = "safetensors-0.3.3.tar.gz", hash = "sha256:edb7072d788c4f929d0f5735d3a2fb51e5a27f833587828583b7f5747af1a2b8"}, +] + +[package.extras] +all = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "flax (>=0.6.3)", "h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "isort (>=5.5.4)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)", "tensorflow (==2.11.0)", "torch (>=1.10)"] +dev = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "flax (>=0.6.3)", "h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "isort (>=5.5.4)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)", "tensorflow (==2.11.0)", "torch (>=1.10)"] +jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)"] +numpy = ["numpy (>=1.21.6)"] +paddlepaddle = ["numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)"] +pinned-tf = ["tensorflow (==2.11.0)"] +quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"] +tensorflow = ["numpy (>=1.21.6)", "tensorflow (>=2.11.0)"] +testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "numpy (>=1.21.6)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)"] +torch = ["numpy (>=1.21.6)", "torch (>=1.10)"] + +[[package]] +name = "scikit-learn" +version = "1.3.0" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.8" +files = [ + {file = "scikit-learn-1.3.0.tar.gz", hash = "sha256:8be549886f5eda46436b6e555b0e4873b4f10aa21c07df45c4bc1735afbccd7a"}, + {file = "scikit_learn-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:981287869e576d42c682cf7ca96af0c6ac544ed9316328fd0d9292795c742cf5"}, + {file = "scikit_learn-1.3.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:436aaaae2c916ad16631142488e4c82f4296af2404f480e031d866863425d2a2"}, + {file = "scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7e28d8fa47a0b30ae1bd7a079519dd852764e31708a7804da6cb6f8b36e3630"}, + {file = "scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae80c08834a473d08a204d966982a62e11c976228d306a2648c575e3ead12111"}, + {file = "scikit_learn-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:552fd1b6ee22900cf1780d7386a554bb96949e9a359999177cf30211e6b20df6"}, + {file = "scikit_learn-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:79970a6d759eb00a62266a31e2637d07d2d28446fca8079cf9afa7c07b0427f8"}, + {file = "scikit_learn-1.3.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:850a00b559e636b23901aabbe79b73dc604b4e4248ba9e2d6e72f95063765603"}, + {file = "scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee04835fb016e8062ee9fe9074aef9b82e430504e420bff51e3e5fffe72750ca"}, + {file = "scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d953531f5d9f00c90c34fa3b7d7cfb43ecff4c605dac9e4255a20b114a27369"}, + {file = "scikit_learn-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:151ac2bf65ccf363664a689b8beafc9e6aae36263db114b4ca06fbbbf827444a"}, + {file = "scikit_learn-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a885a9edc9c0a341cab27ec4f8a6c58b35f3d449c9d2503a6fd23e06bbd4f6a"}, + {file = "scikit_learn-1.3.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:9877af9c6d1b15486e18a94101b742e9d0d2f343d35a634e337411ddb57783f3"}, + {file = "scikit_learn-1.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c470f53cea065ff3d588050955c492793bb50c19a92923490d18fcb637f6383a"}, + {file = "scikit_learn-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd6e2d7389542eae01077a1ee0318c4fec20c66c957f45c7aac0c6eb0fe3c612"}, + {file = "scikit_learn-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:3a11936adbc379a6061ea32fa03338d4ca7248d86dd507c81e13af428a5bc1db"}, + {file = "scikit_learn-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:998d38fcec96584deee1e79cd127469b3ad6fefd1ea6c2dfc54e8db367eb396b"}, + {file = "scikit_learn-1.3.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:ded35e810438a527e17623ac6deae3b360134345b7c598175ab7741720d7ffa7"}, + {file = "scikit_learn-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e8102d5036e28d08ab47166b48c8d5e5810704daecf3a476a4282d562be9a28"}, + {file = "scikit_learn-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7617164951c422747e7c32be4afa15d75ad8044f42e7d70d3e2e0429a50e6718"}, + {file = "scikit_learn-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:1d54fb9e6038284548072df22fd34777e434153f7ffac72c8596f2d6987110dd"}, +] + +[package.dependencies] +joblib = ">=1.1.1" +numpy = ">=1.17.3" +scipy = ">=1.5.0" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.10.1)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] +tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.16.2)"] + +[[package]] +name = "scipy" +version = "1.9.3" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "scipy-1.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:1884b66a54887e21addf9c16fb588720a8309a57b2e258ae1c7986d4444d3bc0"}, + {file = "scipy-1.9.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:83b89e9586c62e787f5012e8475fbb12185bafb996a03257e9675cd73d3736dd"}, + {file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a72d885fa44247f92743fc20732ae55564ff2a519e8302fb7e18717c5355a8b"}, + {file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01e1dd7b15bd2449c8bfc6b7cc67d630700ed655654f0dfcf121600bad205c9"}, + {file = "scipy-1.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:68239b6aa6f9c593da8be1509a05cb7f9efe98b80f43a5861cd24c7557e98523"}, + {file = "scipy-1.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b41bc822679ad1c9a5f023bc93f6d0543129ca0f37c1ce294dd9d386f0a21096"}, + {file = "scipy-1.9.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:90453d2b93ea82a9f434e4e1cba043e779ff67b92f7a0e85d05d286a3625df3c"}, + {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c06e62a390a9167da60bedd4575a14c1f58ca9dfde59830fc42e5197283dab"}, + {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abaf921531b5aeaafced90157db505e10345e45038c39e5d9b6c7922d68085cb"}, + {file = "scipy-1.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:06d2e1b4c491dc7d8eacea139a1b0b295f74e1a1a0f704c375028f8320d16e31"}, + {file = "scipy-1.9.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a04cd7d0d3eff6ea4719371cbc44df31411862b9646db617c99718ff68d4840"}, + {file = "scipy-1.9.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:545c83ffb518094d8c9d83cce216c0c32f8c04aaf28b92cc8283eda0685162d5"}, + {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d54222d7a3ba6022fdf5773931b5d7c56efe41ede7f7128c7b1637700409108"}, + {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cff3a5295234037e39500d35316a4c5794739433528310e117b8a9a0c76d20fc"}, + {file = "scipy-1.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:2318bef588acc7a574f5bfdff9c172d0b1bf2c8143d9582e05f878e580a3781e"}, + {file = "scipy-1.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d644a64e174c16cb4b2e41dfea6af722053e83d066da7343f333a54dae9bc31c"}, + {file = "scipy-1.9.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:da8245491d73ed0a994ed9c2e380fd058ce2fa8a18da204681f2fe1f57f98f95"}, + {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4db5b30849606a95dcf519763dd3ab6fe9bd91df49eba517359e450a7d80ce2e"}, + {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c68db6b290cbd4049012990d7fe71a2abd9ffbe82c0056ebe0f01df8be5436b0"}, + {file = "scipy-1.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:5b88e6d91ad9d59478fafe92a7c757d00c59e3bdc3331be8ada76a4f8d683f58"}, + {file = "scipy-1.9.3.tar.gz", hash = "sha256:fbc5c05c85c1a02be77b1ff591087c83bc44579c6d2bd9fb798bb64ea5e1a027"}, +] + +[package.dependencies] +numpy = ">=1.18.5,<1.26.0" + +[package.extras] +dev = ["flake8", "mypy", "pycodestyle", "typing_extensions"] +doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-panels (>=0.5.2)", "sphinx-tabs"] +test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + [[package]] name = "selenium" version = "4.11.2" @@ -2760,6 +3131,82 @@ files = [ {file = "semver-3.0.1.tar.gz", hash = 
"sha256:9ec78c5447883c67b97f98c3b6212796708191d22e4ad30f4570f840171cbce1"}, ] +[[package]] +name = "sentence-transformers" +version = "2.2.2" +description = "Multilingual text embeddings" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "sentence-transformers-2.2.2.tar.gz", hash = "sha256:dbc60163b27de21076c9a30d24b5b7b6fa05141d68cf2553fa9a77bf79a29136"}, +] + +[package.dependencies] +huggingface-hub = ">=0.4.0" +nltk = "*" +numpy = "*" +scikit-learn = "*" +scipy = "*" +sentencepiece = "*" +torch = ">=1.6.0" +torchvision = "*" +tqdm = "*" +transformers = ">=4.6.0,<5.0.0" + +[[package]] +name = "sentencepiece" +version = "0.1.99" +description = "SentencePiece python wrapper" +optional = false +python-versions = "*" +files = [ + {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0eb528e70571b7c02723e5804322469b82fe7ea418c96051d0286c0fa028db73"}, + {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d7fafb2c4e4659cbdf303929503f37a26eabc4ff31d3a79bf1c5a1b338caa7"}, + {file = "sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be9cf5b9e404c245aeb3d3723c737ba7a8f5d4ba262ef233a431fa6c45f732a0"}, + {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baed1a26464998f9710d20e52607c29ffd4293e7c71c6a1f83f51ad0911ec12c"}, + {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9832f08bb372d4c8b567612f8eab9e36e268dff645f1c28f9f8e851be705f6d1"}, + {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:019e7535108e309dae2b253a75834fc3128240aa87c00eb80732078cdc182588"}, + {file = "sentencepiece-0.1.99-cp310-cp310-win32.whl", hash = "sha256:fa16a830416bb823fa2a52cbdd474d1f7f3bba527fd2304fb4b140dad31bb9bc"}, + {file = "sentencepiece-0.1.99-cp310-cp310-win_amd64.whl", hash = "sha256:14b0eccb7b641d4591c3e12ae44cab537d68352e4d3b6424944f0c447d2348d5"}, + {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6d3c56f24183a1e8bd61043ff2c58dfecdc68a5dd8955dc13bab83afd5f76b81"}, + {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed6ea1819fd612c989999e44a51bf556d0ef6abfb553080b9be3d347e18bcfb7"}, + {file = "sentencepiece-0.1.99-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2a0260cd1fb7bd8b4d4f39dc2444a8d5fd4e0a0c4d5c899810ef1abf99b2d45"}, + {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a1abff4d1ff81c77cac3cc6fefa34fa4b8b371e5ee51cb7e8d1ebc996d05983"}, + {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004e6a621d4bc88978eecb6ea7959264239a17b70f2cbc348033d8195c9808ec"}, + {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db361e03342c41680afae5807590bc88aa0e17cfd1a42696a160e4005fcda03b"}, + {file = "sentencepiece-0.1.99-cp311-cp311-win32.whl", hash = "sha256:2d95e19168875b70df62916eb55428a0cbcb834ac51d5a7e664eda74def9e1e0"}, + {file = "sentencepiece-0.1.99-cp311-cp311-win_amd64.whl", hash = "sha256:f90d73a6f81248a909f55d8e6ef56fec32d559e1e9af045f0b0322637cb8e5c7"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:62e24c81e74bd87a6e0d63c51beb6527e4c0add67e1a17bac18bcd2076afcfeb"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:57efcc2d51caff20d9573567d9fd3f854d9efe613ed58a439c78c9f93101384a"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a904c46197993bd1e95b93a6e373dca2f170379d64441041e2e628ad4afb16f"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d89adf59854741c0d465f0e1525b388c0d174f611cc04af54153c5c4f36088c4"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-win32.whl", hash = "sha256:47c378146928690d1bc106fdf0da768cebd03b65dd8405aa3dd88f9c81e35dba"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-win_amd64.whl", hash = "sha256:9ba142e7a90dd6d823c44f9870abdad45e6c63958eb60fe44cca6828d3b69da2"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7b1a9ae4d7c6f1f867e63370cca25cc17b6f4886729595b885ee07a58d3cec3"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0f644c9d4d35c096a538507b2163e6191512460035bf51358794a78515b74f7"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c8843d23a0f686d85e569bd6dcd0dd0e0cbc03731e63497ca6d5bacd18df8b85"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33e6f690a1caebb4867a2e367afa1918ad35be257ecdb3455d2bbd787936f155"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-win32.whl", hash = "sha256:8a321866c2f85da7beac74a824b4ad6ddc2a4c9bccd9382529506d48f744a12c"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-win_amd64.whl", hash = "sha256:c42f753bcfb7661c122a15b20be7f684b61fc8592c89c870adf52382ea72262d"}, + {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:85b476406da69c70586f0bb682fcca4c9b40e5059814f2db92303ea4585c650c"}, + {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cfbcfe13c69d3f87b7fcd5da168df7290a6d006329be71f90ba4f56bc77f8561"}, + {file = "sentencepiece-0.1.99-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:445b0ec381af1cd4eef95243e7180c63d9c384443c16c4c47a28196bd1cda937"}, + {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6890ea0f2b4703f62d0bf27932e35808b1f679bdb05c7eeb3812b935ba02001"}, + {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb71af492b0eefbf9f2501bec97bcd043b6812ab000d119eaf4bd33f9e283d03"}, + {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27b866b5bd3ddd54166bbcbf5c8d7dd2e0b397fac8537991c7f544220b1f67bc"}, + {file = "sentencepiece-0.1.99-cp38-cp38-win32.whl", hash = "sha256:b133e8a499eac49c581c3c76e9bdd08c338cc1939e441fee6f92c0ccb5f1f8be"}, + {file = "sentencepiece-0.1.99-cp38-cp38-win_amd64.whl", hash = "sha256:0eaf3591dd0690a87f44f4df129cf8d05d8a4029b5b6709b489b8e27f9a9bcff"}, + {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38efeda9bbfb55052d482a009c6a37e52f42ebffcea9d3a98a61de7aee356a28"}, + {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c030b081dc1e1bcc9fadc314b19b740715d3d566ad73a482da20d7d46fd444c"}, + {file = "sentencepiece-0.1.99-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:84dbe53e02e4f8a2e45d2ac3e430d5c83182142658e25edd76539b7648928727"}, + {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b0f55d0a0ee1719b4b04221fe0c9f0c3461dc3dabd77a035fa2f4788eb3ef9a"}, + {file = 
"sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18e800f206cd235dc27dc749299e05853a4e4332e8d3dfd81bf13d0e5b9007d9"}, + {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ae1c40cda8f9d5b0423cfa98542735c0235e7597d79caf318855cdf971b2280"}, + {file = "sentencepiece-0.1.99-cp39-cp39-win32.whl", hash = "sha256:c84ce33af12ca222d14a1cdd37bd76a69401e32bc68fe61c67ef6b59402f4ab8"}, + {file = "sentencepiece-0.1.99-cp39-cp39-win_amd64.whl", hash = "sha256:350e5c74d739973f1c9643edb80f7cc904dc948578bcb1d43c6f2b173e5d18dd"}, + {file = "sentencepiece-0.1.99.tar.gz", hash = "sha256:189c48f5cb2949288f97ccdb97f0473098d9c3dcf5a3d99d4eabe719ec27297f"}, +] + [[package]] name = "setuptools" version = "68.1.2" @@ -3190,6 +3637,34 @@ anyio = ">=3.4.0,<5" [package.extras] full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] +[[package]] +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, +] + +[package.dependencies] +mpmath = ">=0.19" + +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tenacity" version = "8.2.3" @@ -3290,6 +3765,17 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.6.0)"] +[[package]] +name = "threadpoolctl" +version = "3.2.0" +description = "threadpoolctl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.2.0-py3-none-any.whl", hash = "sha256:2b7818516e423bdaebb97c723f86a7c6b0a83d3f3b0970328d66f4d9104dc032"}, + {file = "threadpoolctl-3.2.0.tar.gz", hash = "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355"}, +] + [[package]] name = "tiktoken" version = "0.4.0" @@ -3335,6 +3821,60 @@ requests = ">=2.26.0" [package.extras] blobfile = ["blobfile (>=2)"] +[[package]] +name = "tokenizers" +version = "0.13.3" +description = "Fast and Customizable Tokenizers" +optional = false +python-versions = "*" +files = [ + {file = "tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:f3835c5be51de8c0a092058a4d4380cb9244fb34681fd0a295fbf0a52a5fdf33"}, + {file = "tokenizers-0.13.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4ef4c3e821730f2692489e926b184321e887f34fb8a6b80b8096b966ba663d07"}, + {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5fd1a6a25353e9aa762e2aae5a1e63883cad9f4e997c447ec39d071020459bc"}, + {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee0b1b311d65beab83d7a41c56a1e46ab732a9eed4460648e8eb0bd69fc2d059"}, + {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ef4215284df1277dadbcc5e17d4882bda19f770d02348e73523f7e7d8b8d396"}, + {file = 
"tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4d53976079cff8a033f778fb9adca2d9d69d009c02fa2d71a878b5f3963ed30"}, + {file = "tokenizers-0.13.3-cp310-cp310-win32.whl", hash = "sha256:1f0e3b4c2ea2cd13238ce43548959c118069db7579e5d40ec270ad77da5833ce"}, + {file = "tokenizers-0.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:89649c00d0d7211e8186f7a75dfa1db6996f65edce4b84821817eadcc2d3c79e"}, + {file = "tokenizers-0.13.3-cp311-cp311-macosx_10_11_universal2.whl", hash = "sha256:56b726e0d2bbc9243872b0144515ba684af5b8d8cd112fb83ee1365e26ec74c8"}, + {file = "tokenizers-0.13.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc5c022ce692e1f499d745af293ab9ee6f5d92538ed2faf73f9708c89ee59ce6"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f55c981ac44ba87c93e847c333e58c12abcbb377a0c2f2ef96e1a266e4184ff2"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f247eae99800ef821a91f47c5280e9e9afaeed9980fc444208d5aa6ba69ff148"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b3e3215d048e94f40f1c95802e45dcc37c5b05eb46280fc2ccc8cd351bff839"}, + {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ba2b0bf01777c9b9bc94b53764d6684554ce98551fec496f71bc5be3a03e98b"}, + {file = "tokenizers-0.13.3-cp311-cp311-win32.whl", hash = "sha256:cc78d77f597d1c458bf0ea7c2a64b6aa06941c7a99cb135b5969b0278824d808"}, + {file = "tokenizers-0.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:ecf182bf59bd541a8876deccf0360f5ae60496fd50b58510048020751cf1724c"}, + {file = "tokenizers-0.13.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:0527dc5436a1f6bf2c0327da3145687d3bcfbeab91fed8458920093de3901b44"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07cbb2c307627dc99b44b22ef05ff4473aa7c7cc1fec8f0a8b37d8a64b1a16d2"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4560dbdeaae5b7ee0d4e493027e3de6d53c991b5002d7ff95083c99e11dd5ac0"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64064bd0322405c9374305ab9b4c07152a1474370327499911937fd4a76d004b"}, + {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8c6e2ab0f2e3d939ca66aa1d596602105fe33b505cd2854a4c1717f704c51de"}, + {file = "tokenizers-0.13.3-cp37-cp37m-win32.whl", hash = "sha256:6cc29d410768f960db8677221e497226e545eaaea01aa3613fa0fdf2cc96cff4"}, + {file = "tokenizers-0.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fc2a7fdf864554a0dacf09d32e17c0caa9afe72baf9dd7ddedc61973bae352d8"}, + {file = "tokenizers-0.13.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8791dedba834c1fc55e5f1521be325ea3dafb381964be20684b92fdac95d79b7"}, + {file = "tokenizers-0.13.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:d607a6a13718aeb20507bdf2b96162ead5145bbbfa26788d6b833f98b31b26e1"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3791338f809cd1bf8e4fee6b540b36822434d0c6c6bc47162448deee3f77d425"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2f35f30e39e6aab8716f07790f646bdc6e4a853816cc49a95ef2a9016bf9ce6"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash 
= "sha256:310204dfed5aa797128b65d63538a9837cbdd15da2a29a77d67eefa489edda26"}, + {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0f9b92ea052305166559f38498b3b0cae159caea712646648aaa272f7160963"}, + {file = "tokenizers-0.13.3-cp38-cp38-win32.whl", hash = "sha256:9a3fa134896c3c1f0da6e762d15141fbff30d094067c8f1157b9fdca593b5806"}, + {file = "tokenizers-0.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:8e7b0cdeace87fa9e760e6a605e0ae8fc14b7d72e9fc19c578116f7287bb873d"}, + {file = "tokenizers-0.13.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:00cee1e0859d55507e693a48fa4aef07060c4bb6bd93d80120e18fea9371c66d"}, + {file = "tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a23ff602d0797cea1d0506ce69b27523b07e70f6dda982ab8cf82402de839088"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70ce07445050b537d2696022dafb115307abdffd2a5c106f029490f84501ef97"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:280ffe95f50eaaf655b3a1dc7ff1d9cf4777029dbbc3e63a74e65a056594abc3"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97acfcec592f7e9de8cadcdcda50a7134423ac8455c0166b28c9ff04d227b371"}, + {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd7730c98a3010cd4f523465867ff95cd9d6430db46676ce79358f65ae39797b"}, + {file = "tokenizers-0.13.3-cp39-cp39-win32.whl", hash = "sha256:48625a108029cb1ddf42e17a81b5a3230ba6888a70c9dc14e81bc319e812652d"}, + {file = "tokenizers-0.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:bc0a6f1ba036e482db6453571c9e3e60ecd5489980ffd95d11dc9f960483d783"}, + {file = "tokenizers-0.13.3.tar.gz", hash = "sha256:2e546dbb68b623008a5442353137fbb0123d311a6d7ba52f2667c8862a75af2e"}, +] + +[package.extras] +dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] +docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] +testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] + [[package]] name = "tomli" version = "2.0.1" @@ -3357,6 +3897,83 @@ files = [ {file = "tomlkit-0.12.1.tar.gz", hash = "sha256:38e1ff8edb991273ec9f6181244a6a391ac30e9f5098e7535640ea6be97a7c86"}, ] +[[package]] +name = "torch" +version = "2.0.1" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:8ced00b3ba471856b993822508f77c98f48a458623596a4c43136158781e306a"}, + {file = "torch-2.0.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:359bfaad94d1cda02ab775dc1cc386d585712329bb47b8741607ef6ef4950747"}, + {file = "torch-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c84e44d9002182edd859f3400deaa7410f5ec948a519cc7ef512c2f9b34d2c4"}, + {file = "torch-2.0.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:567f84d657edc5582d716900543e6e62353dbe275e61cdc36eda4929e46df9e7"}, + {file = "torch-2.0.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:787b5a78aa7917465e9b96399b883920c88a08f4eb63b5a5d2d1a16e27d2f89b"}, + {file = "torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e617b1d0abaf6ced02dbb9486803abfef0d581609b09641b34fa315c9c40766d"}, + {file = "torch-2.0.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b6019b1de4978e96daa21d6a3ebb41e88a0b474898fe251fd96189587408873e"}, + {file = 
"torch-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbd68cbd1cd9da32fe5d294dd3411509b3d841baecb780b38b3b7b06c7754434"}, + {file = "torch-2.0.1-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:ef654427d91600129864644e35deea761fb1fe131710180b952a6f2e2207075e"}, + {file = "torch-2.0.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:25aa43ca80dcdf32f13da04c503ec7afdf8e77e3a0183dd85cd3e53b2842e527"}, + {file = "torch-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5ef3ea3d25441d3957348f7e99c7824d33798258a2bf5f0f0277cbcadad2e20d"}, + {file = "torch-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0882243755ff28895e8e6dc6bc26ebcf5aa0911ed81b2a12f241fc4b09075b13"}, + {file = "torch-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:f66aa6b9580a22b04d0af54fcd042f52406a8479e2b6a550e3d9f95963e168c8"}, + {file = "torch-2.0.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:1adb60d369f2650cac8e9a95b1d5758e25d526a34808f7448d0bd599e4ae9072"}, + {file = "torch-2.0.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:1bcffc16b89e296826b33b98db5166f990e3b72654a2b90673e817b16c50e32b"}, + {file = "torch-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e10e1597f2175365285db1b24019eb6f04d53dcd626c735fc502f1e8b6be9875"}, + {file = "torch-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:423e0ae257b756bb45a4b49072046772d1ad0c592265c5080070e0767da4e490"}, + {file = "torch-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8742bdc62946c93f75ff92da00e3803216c6cce9b132fbca69664ca38cfb3e18"}, + {file = "torch-2.0.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:c62df99352bd6ee5a5a8d1832452110435d178b5164de450831a3a8cc14dc680"}, + {file = "torch-2.0.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:671a2565e3f63b8fe8e42ae3e36ad249fe5e567435ea27b94edaa672a7d0c416"}, +] + +[package.dependencies] +filelock = "*" +jinja2 = "*" +networkx = "*" +sympy = "*" +typing-extensions = "*" + +[package.extras] +opt-einsum = ["opt-einsum (>=3.3)"] + +[[package]] +name = "torchvision" +version = "0.15.2" +description = "image and video datasets and models for torch deep learning" +optional = false +python-versions = ">=3.8" +files = [ + {file = "torchvision-0.15.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7754088774e810c5672b142a45dcf20b1bd986a5a7da90f8660c43dc43fb850c"}, + {file = "torchvision-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37eb138e13f6212537a3009ac218695483a635c404b6cc1d8e0d0d978026a86d"}, + {file = "torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:54143f7cc0797d199b98a53b7d21c3f97615762d4dd17ad45a41c7e80d880e73"}, + {file = "torchvision-0.15.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:1eefebf5fbd01a95fe8f003d623d941601c94b5cec547b420da89cb369d9cf96"}, + {file = "torchvision-0.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:96fae30c5ca8423f4b9790df0f0d929748e32718d88709b7b567d2f630c042e3"}, + {file = "torchvision-0.15.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5f35f6bd5bcc4568e6522e4137fa60fcc72f4fa3e615321c26cd87e855acd398"}, + {file = "torchvision-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:757505a0ab2be7096cb9d2bf4723202c971cceddb72c7952a7e877f773de0f8a"}, + {file = "torchvision-0.15.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:012ad25cfd9019ff9b0714a168727e3845029be1af82296ff1e1482931fa4b80"}, + {file = "torchvision-0.15.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b02a7ffeaa61448737f39a4210b8ee60234bda0515a0c0d8562f884454105b0f"}, + {file = 
"torchvision-0.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:10be76ceded48329d0a0355ac33da131ee3993ff6c125e4a02ab34b5baa2472c"}, + {file = "torchvision-0.15.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8f12415b686dba884fb086f53ac803f692be5a5cdd8a758f50812b30fffea2e4"}, + {file = "torchvision-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:31211c01f8b8ec33b8a638327b5463212e79a03e43c895f88049f97af1bd12fd"}, + {file = "torchvision-0.15.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:c55f9889e436f14b4f84a9c00ebad0d31f5b4626f10cf8018e6c676f92a6d199"}, + {file = "torchvision-0.15.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:9a192f2aa979438f23c20e883980b23d13268ab9f819498774a6d2eb021802c2"}, + {file = "torchvision-0.15.2-cp38-cp38-win_amd64.whl", hash = "sha256:c07071bc8d02aa8fcdfe139ab6a1ef57d3b64c9e30e84d12d45c9f4d89fb6536"}, + {file = "torchvision-0.15.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4790260fcf478a41c7ecc60a6d5200a88159fdd8d756e9f29f0f8c59c4a67a68"}, + {file = "torchvision-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:987ab62225b4151a11e53fd06150c5258ced24ac9d7c547e0e4ab6fbca92a5ce"}, + {file = "torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:63df26673e66cba3f17e07c327a8cafa3cce98265dbc3da329f1951d45966838"}, + {file = "torchvision-0.15.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b85f98d4cc2f72452f6792ab4463a3541bc5678a8cdd3da0e139ba2fe8b56d42"}, + {file = "torchvision-0.15.2-cp39-cp39-win_amd64.whl", hash = "sha256:07c462524cc1bba5190c16a9d47eac1fca024d60595a310f23c00b4ffff18b30"}, +] + +[package.dependencies] +numpy = "*" +pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" +requests = "*" +torch = "2.0.1" + +[package.extras] +scipy = ["scipy"] + [[package]] name = "tqdm" version = "4.66.1" @@ -3377,6 +3994,75 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "transformers" +version = "4.32.1" +description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "transformers-4.32.1-py3-none-any.whl", hash = "sha256:b930d3dbd907a3f300cf49e54d63a56f8a0ab16b01a2c2a61ecff37c6de1da08"}, + {file = "transformers-4.32.1.tar.gz", hash = "sha256:1edc8ae1de357d97c3d36b04412aa63d55e6fc0c4b39b419a7d380ed947d2252"}, +] + +[package.dependencies] +filelock = "*" +huggingface-hub = ">=0.15.1,<1.0" +numpy = ">=1.17" +packaging = ">=20.0" +pyyaml = ">=5.1" +regex = "!=2019.12.17" +requests = "*" +safetensors = ">=0.3.1" +tokenizers = ">=0.11.1,<0.11.3 || >0.11.3,<0.14" +tqdm = ">=4.27" + +[package.extras] +accelerate = ["accelerate (>=0.20.3)"] +agents = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.9,!=1.12.0)"] +all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision"] +audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +codecarbon = ["codecarbon (==1.2.0)"] +deepspeed = 
["accelerate (>=0.20.3)", "deepspeed (>=0.9.3)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core 
(>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision"] +docs-specific = ["hf-doc-builder"] +fairscale = ["fairscale (>0.3)"] +flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"] +flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +ftfy = ["ftfy"] +integrations = ["optuna", "ray[tune]", "sigopt"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +modelcreation = ["cookiecutter (==1.7.3)"] +natten = ["natten (>=0.14.6)"] +onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] +onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +optuna = ["optuna"] +quality = ["GitPython (<3.1.19)", "black (>=23.1,<24.0)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (>=0.0.241,<=0.0.259)", "urllib3 (<2.0.0)"] +ray = ["ray[tune]"] +retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] +sagemaker = ["sagemaker (>=2.31.0)"] +sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] +serving = ["fastapi", "pydantic (<2)", "starlette", "uvicorn"] +sigopt = ["sigopt"] +sklearn = ["scikit-learn"] +speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx"] +tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx"] +tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +timm = ["timm"] +tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.14)"] +torch = ["accelerate (>=0.20.3)", "torch (>=1.9,!=1.12.0)"] +torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +torch-vision = ["Pillow (<10.0.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "tqdm (>=4.27)"] +video = ["av (==9.2.0)", "decord (==0.6.0)"] +vision = ["Pillow (<10.0.0)"] + 
[[package]]
name = "trio"
version = "0.22.2"
@@ -3546,13 +4232,13 @@ colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\" and python

 [[package]]
 name = "weaviate-client"
-version = "3.23.0"
+version = "3.23.2"
 description = "A python native Weaviate client"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "weaviate-client-3.23.0.tar.gz", hash = "sha256:3ffd7f1460c9e32755d84d4f5fc63dfc0bd990dbe2c3dc20d5c68119d467680e"},
-    {file = "weaviate_client-3.23.0-py3-none-any.whl", hash = "sha256:3d3bb75c1d96b2b71e213c5eb885ae3e3f42e4304955383c467d100187d9ff8e"},
+    {file = "weaviate-client-3.23.2.tar.gz", hash = "sha256:1c8c94df032dd2fa5a4ea615fc69ccb983ffad5cc02974f78c793839e61ac150"},
+    {file = "weaviate_client-3.23.2-py3-none-any.whl", hash = "sha256:88ffc38cca07806d64726cc74bc194c7da50b222aa4e2cd129f4c1f5e53e9b61"},
 ]

 [package.dependencies]
@@ -3780,4 +4466,4 @@ multidict = ">=4.0"

 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "5629225437c5aec01f9f862d46d6d1e68abde4c42a0c1ad709df875883171991"
+content-hash = "761b58204631452d77e13bbc2d61034704e8e109619db4addd26ec159b9bb176"
diff --git a/level_2/pyproject.toml b/level_2/pyproject.toml
index 5a252ba37..9f4e66302 100644
--- a/level_2/pyproject.toml
+++ b/level_2/pyproject.toml
@@ -40,6 +40,7 @@ weaviate-client = "^3.22.1"
 python-multipart = "^0.0.6"
 deep-translator = "^1.11.4"
 humanize = "^4.8.0"
+deepeval = "^0.10.12"
diff --git a/level_2/tests/__pycache__/crud_test.cpython-311-pytest-7.4.0.pyc b/level_2/tests/__pycache__/crud_test.cpython-311-pytest-7.4.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e540734a9b419ee4bf52515005424c9581ad8bfb
GIT binary patch
[base85-encoded bytecode omitted: 4301 bytes]

diff --git a/level_2/tests/__pycache__/semantic_tests.cpython-311-pytest-7.4.0.pyc b/level_2/tests/__pycache__/semantic_tests.cpython-311-pytest-7.4.0.pyc
new file mode 100644
GIT binary patch
[base85-encoded bytecode omitted: 2291 bytes]

diff --git a/level_2/tests/base_test_set.json b/level_2/tests/base_test_set.json
new file mode 100644
index 000000000..1f5dab926
--- /dev/null
+++ b/level_2/tests/base_test_set.json
@@ -0,0 +1,38 @@
+ {
+ "q1": {
+ "question": "What does Buck learn from being harnessed and made to work with François?",
+ "answer": "Buck learns several important lessons from being harnessed and made to work with François. First, he learns the lesson of obedience and the consequences of disobedience. He quickly realizes that François demands instant obedience and uses his whip to enforce it, and that he must listen and follow commands promptly to avoid the whip [16]. Second, Buck learns the value of hard work and the satisfaction it brings. Despite the new and strange nature of the work, Buck learns quickly and makes remarkable progress, earning respect from François [16][19]. Furthermore, Buck learns important skills related to pulling a sled, such as stopping at command ('ho'), going ahead ('mush'), swinging wide on bends, and staying clear of the wheeler when the sled goes downhill. He also learns the dynamics of working in a team and how to respond to the lead dog's instructions [16][19]."
+ },
+ "q2": {
+ "question": "How many chapters does the document have",
+ "answer": "The document has a total of 7 chapters. The chapter titles are: Into the Primitive [Chapter 1012], The Law of Club and Fang [Chapter 1013], The Dominant Primordial Beast [Chapter 1014], Who Has Won to Mastership [Chapter 1015], The Toil of Trace and Trail [Chapter 1016], For the Love of a Man [Chapter 1017], The Sounding of the Call [Chapter 1018]."
+ }, + "q3": { + "question": "Who kidnapped Buck?", + "answer": "Buck was kidnapped by one of the gardener's helpers named Manuel, who sold him to strangers for a profit." + }, + "q4": { + "question": "What is the name of the gardener's helper who kidnapped Buck?", + "answer": "The name of the gardener's helper who kidnapped Buck is Manuel." + }, + "q5": { + "question": "Where was Buck taken after being kidnapped?", + "answer": "After being kidnapped, Buck was taken by Manuel through the orchard to a little flag station known as College Park [7a]. Eventually, Buck was thrown into a baggage car of a train, where he remained unconscious until he woke up and watched the man in the red sweater [8] [11a]. The specific location Buck was taken to after being kidnapped is not explicitly mentioned in the given texts." + }, + "q6": { + "question": "What is the law of club and fang?", + "answer": "The law of club and fang refers to the harsh and primal rules of survival in the wild, where physical strength and aggression determine dominance and power. The club represents the power wielded by humans over animals. Buck realizes that in order to survive in this new environment, he must adapt and submit to this law. The law of club and fang signifies the brutal and primitive nature of life in the wild." + }, + "q7": { + "question": "What is the mother of Buck?", + "answer": "The mother of Buck, the dog in the story 'The Call of the Wild' by Jack London, is a Scottish shepherd dog named Shep." + }, + "q8": { + "question": "How did Buck feel after being kidnapped?", + "answer": "After being kidnapped, Buck felt anger and resentment towards his captors [7]. He was initially cooperative but grew angry as he was mistreated and felt violated and vilely treated during his transportation [8]." + }, + "q9": { + "question": "Was Buck beaten in captivity?", + "answer": "Yes, Buck was beaten while in captivity. In document snippet [11], Buck rushed at the man who had been tormenting him, and the man delivered a blow that rendered Buck senseless." 
+ }
+ }
\ No newline at end of file
diff --git a/level_2/tests/crud_test.py b/level_2/tests/crud_test.py
index 403273481..38165d39b 100644
--- a/level_2/tests/crud_test.py
+++ b/level_2/tests/crud_test.py
@@ -74,3 +74,5 @@ class TestMemory(unittest.TestCase):
 if __name__ == '__main__':
     unittest.main()
+
+
diff --git a/level_2/tests/semantic_tests.py b/level_2/tests/semantic_tests.py
new file mode 100644
index 000000000..ba957dae7
--- /dev/null
+++ b/level_2/tests/semantic_tests.py
@@ -0,0 +1,119 @@
+import asyncio
+import json
+import os
+
+import dotenv
+import openai
+from deepeval.metrics.overall_score import OverallScoreMetric
+
+dotenv.load_dotenv()
+
+from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory
+
+openai.api_key = os.getenv("OPENAI_API_KEY", "")
+
+# Semantic test harness: answer questions with and without Memory-provided
+# context and score the answers against the base test set.
+
+
+async def main():
+    async def generate_context(query: str = 'bla'):
+        # Fetch semantic memory for the query, ingesting the source PDF first
+        # if nothing has been stored yet.
+        memory = Memory(user_id="TestUser")
+
+        await memory.async_init()
+
+        memory_loaded = await memory._fetch_semantic_memory(observation=query, params=None)
+
+        if memory_loaded:
+            return memory_loaded["data"]["Get"]["SEMANTICMEMORY"][0]["text"]
+        else:
+            params = {
+                "version": "1.0",
+                "agreement_id": "AG123456",
+                "privacy_policy": "https://example.com/privacy",
+                "terms_of_service": "https://example.com/terms",
+                "format": "json",
+                "schema_version": "1.1",
+                "checksum": "a1b2c3d4e5f6",
+                "owner": "John Doe",
+                "license": "MIT",
+                "validity_start": "2023-08-01",
+                "validity_end": "2024-07-31",
+            }
+            loader_settings = {
+                "format": "PDF",
+                "source": "url",
+                "path": "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
+            }
+            load_jack_london = await memory._add_semantic_memory(observation=query, loader_settings=loader_settings, params=params)
+            memory_loaded = await memory._fetch_semantic_memory(observation=query, params=None)
+            return memory_loaded["data"]["Get"]["SEMANTICMEMORY"][0]["text"]
+
+        # return load_jack_london
+        #
+        # modulator = {"relevance": 0.0, "saliency": 0.0, "frequency": 0.0}
+        # # #
+        # run_main_buffer = await memory._create_buffer_context(
+        #     user_input="I want to know how does Buck adapt to life in the wild and then have that info translated to german ",
+        #     params=params,
+        #     attention_modulators=modulator,
+        # )
+
+    async def generate_chatgpt_output(query: str, context: str = None):
+        # Fall back to Memory-provided context when none is supplied.
+        if context is None:
+            context = await generate_context(query=query)
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "assistant", "content": f"{context}"},
+                {"role": "user", "content": query}
+            ]
+        )
+        llm_output = response.choices[0].message.content
+        return llm_output
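+    # Evaluation loop below: each Q/A pair from base_test_set.json is scored
+    # with deepeval's OverallScoreMetric under one of several context
+    # strategies ("gpt_search", "base_memory_context", "hybrid_search",
+    # "memory_search").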
+    with open('base_test_set.json', 'r') as f:
+        data = json.load(f)
+
+    async def test_overall_score(query: str, output: str = None, expected_output: str = None, context: str = None, context_type: str = None):
+        if context_type == "gpt_search":
+            # Plain model answer, with no retrieved context.
+            context = ""
+            output = await generate_chatgpt_output(query, context=context)
+        elif context_type == "base_memory_context":
+            # Score the retrieved context itself as the answer.
+            context = await generate_context(query=query)
+            output = context
+        elif context_type == "hybrid_search":
+            # Retrieve context, then let the model answer with it.
+            context = await generate_context(query=query)
+            output = await generate_chatgpt_output(query, context=context)
+        elif context_type == "memory_search":
+            # Not implemented yet; uses the caller-provided output/context.
+            pass
+
+        metric = OverallScoreMetric()
+        score = metric.measure(
+            query=query,
+            output=output,
+            expected_output=expected_output,
+            context=context
+        )
+        print('Here is the score', score)
+
+        return score
+
+    # await generate_chatgpt_output(query=" When was call of the wild written?")
+    scores = {}
+    for key, item in data.items():
+        question = item['question']
+        expected_ans = item['answer']
+        values = await test_overall_score(query=question, expected_output=expected_ans, context_type="hybrid_search")
+        scores[key] = values
+
+    print(scores)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
From d9a2ee6646c73ee771b01a8b8c5732bdcb7590f0 Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Sun, 10 Sep 2023 13:10:29 +0200
Subject: [PATCH 2/5] update flow for the docker image

---
 .../level_2_pdf_vectorstore__dlt_contracts.py | 143 +++++++++++-------
 level_2/utils.py                              | 111 ++++++++++++++
 2 files changed, 198 insertions(+), 56 deletions(-)
 create mode 100644 level_2/utils.py

diff --git a/level_2/level_2_pdf_vectorstore__dlt_contracts.py b/level_2/level_2_pdf_vectorstore__dlt_contracts.py
index 645de2b2f..ebbf82c0d 100644
--- a/level_2/level_2_pdf_vectorstore__dlt_contracts.py
+++ b/level_2/level_2_pdf_vectorstore__dlt_contracts.py
@@ -808,63 +808,86 @@ class EpisodicBuffer(BaseMemory):
         # check if modulators exist, initialize the modulators if needed
         if attention_modulators is None:
-            try:
+            # try:
+            print("Starting with attention mods")
+            attention_modulators = await self.fetch_memories(observation="Attention modulators",
+                                                             namespace="BUFFERMEMORY")
+
+            print("Attention modulators exist", str(attention_modulators))
+            lookup_value_episodic = await self.fetch_memories(
+                observation=str(output), namespace="EPISODICMEMORY"
+            )
+            # lookup_value_episodic= lookup_value_episodic["data"]["Get"]["EPISODICMEMORY"][0]["text"]
+            prompt_classify = ChatPromptTemplate.from_template(
+                """You are a classifier. Based on the previous query, determine whether the user was satisfied with the output: {query}"""
+            )
+            json_structure = [{
+                "name": "classifier",
+                "description": "Classification indicating whether the output is satisfactory",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "classification": {
+                            "type": "boolean",
+                            "description": "The classification true or false"
+                        }
+                    }, "required": ["classification"]}
+            }]
+            chain_filter = prompt_classify | self.llm.bind(function_call={"name": "classifier"}, functions=json_structure)
+            classifier_output = await chain_filter.ainvoke({"query": lookup_value_episodic})
+            arguments_str = classifier_output.additional_kwargs['function_call']['arguments']
+            print("This is the arguments string", arguments_str)
+            arguments_dict = json.loads(arguments_str)
+            classifier_value = arguments_dict.get('classification', None)
+
+            print("This is the classifier value", classifier_value)
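+            # A positive classification nudges the attention modulator weights
+            # up; a negative one nudges them down. Either way the updated
+            # weights are persisted to BUFFERMEMORY.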
+            if classifier_value:
+                # adjust the weights of the modulators by adding a positive value
+                print("Lookup value, episodic", lookup_value_episodic["data"]["Get"]["EPISODICMEMORY"][0]["text"])
                 prompt_classify = ChatPromptTemplate.from_template(
-                    """You are a classifier. Determine if based on the previous query if the user was satisfied with the output : {query}"""
+                    """ We know we need to increase the attention modulators for our AI system. The modulators are {modulators} The query is: {query}. Which of the modulators should we increase? Return just the modulator and desired value"""
                 )
-                json_structure = {
-                    "name": "classifier",
-                    "description": "Classification indicating if it's output is satisfactory",
-                    "type": "boolean",
-                    "required": True
-                }
-                chain_filter = prompt_classify | self.llm.bind(function_call= {"name": "classifier"}, functions= json_structure)
-                classifier_output = await chain_filter.ainvoke({"query": lookup_value_episodic})
-                arguments_str = classifier_output.additional_kwargs['function_call']['arguments']
-                arguments_dict = json.loads(arguments_str)
-                classfier_value = arguments_dict.get('classifier', None)
+                chain_modulator = prompt_classify | self.llm
+                classifier_output = await chain_modulator.ainvoke({"query": lookup_value_episodic, "modulators": str(attention_modulators)})
+                print("classifier output 1", classifier_output)
+                diff_layer = DifferentiableLayer(attention_modulators)
+                adjusted_modulator = await diff_layer.adjust_weights(classifier_output)
+                _input = prompt.format_prompt(query=adjusted_modulator)
+                document_context_result = self.llm_base(_input.to_string())
+                document_context_result_parsed = parser.parse(document_context_result)
+                print("Updating with the following weights", str(document_context_result_parsed))
+                await self.add_memories(observation=str(document_context_result_parsed), params=params, namespace="BUFFERMEMORY")
+            else:
+                # adjust the weights of the modulators by adding a negative value
+                print("Lookup value, episodic", lookup_value_episodic)
+                prompt_classify = ChatPromptTemplate.from_template(
+                    """ We know we need to decrease the attention modulators for our AI system. The modulators are {modulators} The query is: {query}. Which of the modulators should we decrease? Return just the modulator and desired value"""
+                )
+                chain_modulator_reduction = prompt_classify | self.llm

-                if classfier_value:
-                    # adjust the weights of the modulators by adding a positive value
-                    prompt_classify = ChatPromptTemplate.from_template(
-                        """ We know we need to increase the classifiers for our AI system. The classifiers are {modulators} The query is: {query}. Which of the classifiers should we decrease? Return just the modulator and desired value"""
-                    )
-                    chain_modulator = prompt_classify | self.llm
-                    classifier_output = await chain_modulator.ainvoke({"query": lookup_value_episodic, "modulators": str(attention_modulators)})
-                    diff_layer = DifferentiableLayer(attention_modulators)
-                    adjusted_modulator = diff_layer.adjust_weights(classifier_output)
-                    _input = prompt.format_prompt(query=adjusted_modulator)
-                    document_context_result = self.llm_base(_input.to_string())
-                    document_context_result_parsed = parser.parse(document_context_result)
-                    await self.add_memories(observation=document_context_result_parsed, namespace="BUFFERMEMORY")
-                else:
-                    # adjust the weights of the modulators by adding a negative value
-                    prompt_classify = ChatPromptTemplate.from_template(
-                        """ We know we need to decrease the classifiers for our AI system. The classifiers are {modulators} The query is: {query}. Which of the classifiers should we decrease? Return just the modulator and desired value"""
-                    )
-                    chain_modulator_reduction = prompt_classify | self.llm
-
-                    classifier_output = await chain_modulator_reduction.ainvoke({"query": lookup_value_episodic, "modulators": str(attention_modulators)})
-                    diff_layer = DifferentiableLayer(attention_modulators)
-                    adjusted_modulator =diff_layer.adjust_weights(classifier_output)
-                    _input = prompt.format_prompt(query=adjusted_modulator)
-                    document_context_result = self.llm_base(_input.to_string())
-                    document_context_result_parsed = parser.parse(document_context_result)
-                    await self.add_memories(observation=document_context_result_parsed, namespace="BUFFERMEMORY")
-        except:
-            # initialize the modulators with default values if they are not provided
-            print("Starting with default modulators")
-            attention_modulators = {
-                "freshness": 0.5,
-                "frequency": 0.5,
-                "relevance": 0.5,
-                "saliency": 0.5,
-            }
+                classifier_output = await chain_modulator_reduction.ainvoke({"query": lookup_value_episodic, "modulators": str(attention_modulators)})
+                print("classifier output 2", classifier_output)
+                diff_layer = DifferentiableLayer(attention_modulators)
+                adjusted_modulator = await diff_layer.adjust_weights(classifier_output)
+                _input = prompt.format_prompt(query=adjusted_modulator)
+                document_context_result = self.llm_base(_input.to_string())
+                document_context_result_parsed = parser.parse(document_context_result)
+                print("Updating with the following weights", str(document_context_result_parsed))
+                await self.add_memories(observation=str(document_context_result_parsed), params=params, namespace="BUFFERMEMORY")
+            # except:
+            #     # initialize the modulators with default values if they are not provided
+            #     print("Starting with default modulators")
+            #     attention_modulators = {
+            #         "freshness": 0.5,
+            #         "frequency": 0.5,
+            #         "relevance": 0.5,
+            #         "saliency": 0.5,
+            #     }
+            #     _input = prompt.format_prompt(query=attention_modulators)
+            #     document_context_result = self.llm_base(_input.to_string())
+            #     document_context_result_parsed = parser.parse(document_context_result)
+            #     await self.add_memories(observation=str(document_context_result_parsed), params=params, namespace="BUFFERMEMORY")

         elif attention_modulators:
             pass
@@ -1140,7 +1163,8 @@ class EpisodicBuffer(BaseMemory):
             query=user_input, steps=str(tasks_list)
             , buffer=str(result_tasks), date= date, attention_modulators=attention_modulators
         )
-
+        print("HERE ARE THE STEPS, BUFFER AND DATE", str(tasks_list))
+        print("here are the result_tasks", str(result_tasks))
         # return "a few things to do like load episodic memory in a structured format"
         output = self.llm_base(_input.to_string())
         result_parsing = parser.parse(output)
@@ -1373,13 +1397,20 @@ async def main():
     # print(load_jack_london)

     modulator = {"relevance": 0.0, "saliency": 0.0, "frequency": 0.0}
-    # #
+
+    run_main_buffer = await memory._run_main_buffer(
         user_input="I want to know how does Buck adapt to life in the wild and then have that info translated to german ",
         params=params,
         attention_modulators=modulator,
     )
     print(run_main_buffer)
+    # #
+    # run_main_buffer = await memory._run_main_buffer(
+    #     user_input="I want to know how does Buck adapt to life in the wild and then have that info translated to german ",
+    #     params=params,
+    #     attention_modulators=None,
+    # )
+    # print(run_main_buffer)
     # del_semantic = await memory._delete_semantic_memory()
     # print(del_semantic)
diff --git a/level_2/utils.py b/level_2/utils.py
new file mode 100644
index 000000000..319f4b500
--- /dev/null
+++ b/level_2/utils.py
@@ -0,0 +1,111 @@
+import os
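+# Standalone helpers for manually seeding and inspecting the episodic and
+# buffer memory stores; run this file directly to exercise the flow end to end.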
+from datetime import datetime
+
+from langchain import PromptTemplate, OpenAI
+from langchain.output_parsers import PydanticOutputParser
+from pydantic import BaseModel, Field
+import dotenv
+dotenv.load_dotenv()
+
+llm_base = OpenAI(
+    temperature=0.0,
+    max_tokens=1200,
+    openai_api_key=os.environ.get("OPENAI_API_KEY"),
+    model_name="gpt-4-0613",
+    )
+def _add_to_episodic(user_input, tasks_list, result_tasks, attention_modulators, params):
+    from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory
+    memory = Memory(user_id="TestUser")
+    class EpisodicTask(BaseModel):
+        """Schema for an individual task."""
+
+        task_order: str = Field(
+            ..., description="The order at which the task needs to be performed"
+        )
+        task_name: str = Field(
+            None, description="The task that needs to be performed"
+        )
+        operation: str = Field(None, description="The operation to be performed")
+        operation_result: str = Field(
+            None, description="The result of the operation"
+        )
+
+    class EpisodicList(BaseModel):
+        """Schema for the record containing a list of tasks."""
+
+        tasks: List[EpisodicTask] = Field(..., description="List of tasks")
+        start_date: str = Field(
+            ..., description="The start date of the task list"
+        )
+        end_date: str = Field(
+            ..., description="The end date of the task list"
+        )
+        user_query: str = Field(
+            ..., description="The original user query"
+        )
+        attention_modulators: str = Field(..., description="List of attention modulators")
+
+    parser = PydanticOutputParser(pydantic_object=EpisodicList)
+    date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+    prompt = PromptTemplate(
+        template="Format the result.\n{format_instructions}\nOriginal query is: {query}\n Steps are: {steps}, buffer is: {buffer}, date is:{date}, attention modulators are: {attention_modulators} \n",
+        input_variables=["query", "steps", "buffer", "date", "attention_modulators"],
+        partial_variables={"format_instructions": parser.get_format_instructions()},
+    )
+
+    _input = prompt.format_prompt(
+        query=user_input, steps=str(tasks_list)
+        , buffer=str(result_tasks), date=date, attention_modulators=attention_modulators
+    )
+
+    # return "a few things to do like load episodic memory in a structured format"
+    output = llm_base(_input.to_string())
+    result_parsing = parser.parse(output)
+    lookup_value = await memory.add_memories(
+        observation=str(result_parsing.json()), params=params, namespace='EPISODICMEMORY'
+    )
+
+
+def add_to_buffer():
+    pass
+
+
+def delete_from_buffer():
+    from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory
+    memory = Memory(user_id="TestUser")
+    memory._delete_buffer_memory()
+
+def delete_from_episodic():
+    from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory
+    memory = Memory(user_id="TestUser")
+    memory._delete_episodic_memory()
+
+if __name__ == "__main__":
+
+    params = {
+        "version": "1.0",
+        "agreement_id": "AG123456",
+        "privacy_policy": "https://example.com/privacy",
+        "terms_of_service": "https://example.com/terms",
+        "format": "json",
+        "schema_version": "1.1",
+        "checksum": "a1b2c3d4e5f6",
+        "owner": "John Doe",
+        "license": "MIT",
+        "validity_start": "2023-08-01",
+        "validity_end": "2024-07-31",
+    }
+    loader_settings = {
+        "format": "PDF",
+        "source": "url",
+        "path": "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
+    }
+    modulator = {"relevance": 0.0, "saliency": 0.0, "frequency": 0.0}
+    user_input = "I want to know how does Buck adapt to life in the wild"
+    # tasks_list =
"""tasks": [{"task_order": "1", "task_name": "Fetch Information", "operation": "fetch from vector store", "original_query": "I want to know how does Buck adapt to life in the wild"]""" + out_tasks = """here are the result_tasks [{'task_order': '1', 'task_name': 'Fetch Information', 'operation': 'fetch from vector store', 'original_query': 'I want to know how does Buck adapt to life in the wild'}, {'docs': [{'semantic_search_term': "Buck's adaptation to wild life", 'document_content': 'THE CALL OF THE WILD 30 \nout of his desire for mastery. He was preëminently cunning, and could \nbide his time with a patience that was nothing less than primitive. \nIt was inevitable that the clash for leadership should come. Buck \nwanted it. He wanted it because it was his nature, because he had been \ngripped tight by that nameless, incomprehensible pride of the trail and \ntrace—that pride which holds dogs in the toil to the last gasp, which \nlures them to die joyfully in the harness, and breaks their hearts if they \nare cut out of the harness. This was the pride of Dave as wheel-dog, of \nSol-leks as he pulled with all his strength; the pride that laid hold of \nthem at break of camp, transforming them from sour and sullen brutes \ninto straining, eager, ambitious creatures; the pride that spurred them on \nall day and dropped them at pitch of camp at night, letting them fall back \ninto gloomy unrest and uncontent. This was the pride that bore up Spitz \nand made him thrash the sled-dogs who blundered and shirked in the \ntraces or hid away at harness-up time in the morning. Likewise it was \nthis pride that made him fear Buck as a possible lead-dog. And this was \nBuck’s pride, too. \nHe openly threatened the other’s leadership. He came between him \nand the shirks he should have punished. And he did it deliberately. One \nnight there was a heavy snowfall, and in the morning Pike, the \nmalingerer, did not appear. He was securely hidden in his nest under a \nfoot of snow. François called him and sought him in vain. Spitz was wild \nwith wrath. He raged through the camp, smelling and digging in every \nlikely place, snarling so frightfully that Pike heard and shivered in his \nhiding-place. \nBut when he was at last unearthed, and Spitz flew at him to punish \nhim, Buck flew, with equal rage, in between. So unexpected was it, and \nso shrewdly managed, that Spitz was hurled backward and off his feet. \nPike, who had been trembling abjectly, took heart at this open mutiny, \nand sprang upon his overthrown leader. Buck, to whom fairplay was a \nforgotten code, likewise sprang upon Spitz. But François, chuckling at \nthe incident while unswerving in the administration of justice, brought \nhis lash down upon Buck with all his might. This failed to drive Buck \nfrom his prostrate rival, and the butt of the whip was brought into play. \nHalf-stunned by the blow, Buck was knocked backward and the lash laid', 'document_relevance': '0.75', 'attention_modulators_list': [{'frequency': 'High', 'saliency': 'High', 'relevance': 'High'}]}], 'user_query': 'I want to know how does Buck adapt to life in the wild and then have that info translated to german'}, {'task_order': '2', 'task_name': 'Translate Information', 'operation': 'translate', 'original_query': 'then have that info translated to german'}, 'DER RUF DER WILDNIS 30\naus seinem Wunsch nach Meisterschaft. 
Er war überaus schlau und konnte es\nwartete seine Zeit mit einer Geduld ab, die geradezu primitiv war.\nEs war unvermeidlich, dass es zu einem Kampf um die Führung kam. Bock\nwollte es. Er wollte es, weil es in seiner Natur lag, weil er es gewesen war\nfestgehalten von diesem namenlosen, unverständlichen Stolz des Weges und\nSpur – dieser Stolz, der Hunde bis zum letzten Atemzug in der Mühsal hält, der\nlockt sie dazu, freudig im Geschirr zu sterben, und bricht ihnen das Herz, wenn sie es tun\nwerden aus dem Kabelbaum herausgeschnitten. Das war der Stolz von Dave als Radhund\nSol-leks, als er mit aller Kraft zog; der Stolz, der mich ergriff\nsie beim Abbruch des Lagers und verwandelte sie in mürrische und mürrische Bestien\nin anstrengende, eifrige, ehrgeizige Wesen; der Stolz, der sie anspornte\nden ganzen Tag und setzte sie nachts auf dem Stellplatz des Lagers ab und ließ sie zurückfallen\nin düstere Unruhe und Unzufriedenheit. Das war der Stolz, der Spitz trug\nund ließ ihn die Schlittenhunde verprügeln, die in der Gegend herumstolperten und sich scheuten\nSpuren hinterlassen oder sich morgens beim Angurten versteckt haben. Ebenso war es\nDieser Stolz ließ ihn Buck als möglichen Leithund fürchten. Und das war\nAuch Bucks Stolz.\nEr bedrohte offen die Führung des anderen. Er kam zwischen ihn\nund die Schirks hätte er bestrafen sollen. Und er hat es absichtlich getan. Eins\nNachts gab es starken Schneefall und am Morgen Pike, der\nSimulant, erschien nicht. Er war sicher in seinem Nest unter einer Decke versteckt\nFuß Schnee. François rief ihn an und suchte ihn vergeblich. Spitz war wild\nmit Zorn. Er tobte durch das Lager, schnupperte und wühlte darin herum\nWahrscheinlicher Ort, er knurrte so schrecklich, dass Pike es hörte und in seinem Kopf zitterte\nVersteck.\nAber als er endlich ausgegraben wurde, flog Spitz auf ihn zu, um ihn zu bestrafen\nihm, Buck flog mit der gleichen Wut dazwischen. So unerwartet war es, und\nEs gelang ihm so geschickt, dass Spitz nach hinten geschleudert wurde und von den Füßen fiel.\nPike, der erbärmlich gezittert hatte, fasste angesichts dieser offenen Meuterei Mut.\nund sprang auf seinen gestürzten Anführer. Buck, für den Fairplay wichtig war\nvergessener Code, sprang ebenfalls Spitz auf. Aber François kicherte\nder Vorfall, während unerschütterlich in der Rechtspflege, gebracht\nEr schlug mit aller Kraft auf Buck ein. 
Das gelang Buck nicht\nvon seinem am Boden liegenden Rivalen, und der Peitschenkolben wurde ins Spiel gebracht.\nVon dem Schlag halb betäubt, wurde Buck nach hinten geschleudert und mit der Peitsche niedergeschlagen']"""
+
+    # _add_to_episodic(user_input=user_input, result_tasks=out_tasks, modulator=None, params=params)
+
+

From ca5e0905263df0b8ceace8a87eef8f8bca3d1fde Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Mon, 11 Sep 2023 20:41:42 +0200
Subject: [PATCH 3/5] Updated the code considerably to fix issues with context
 overloads

---
 .../level_2_pdf_vectorstore__dlt_contracts.py | 228 ++++++++++++------
 level_2/modulators/modulators.py              |   0
 level_2/utils.py                              |  81 ++++++-
 3 files changed, 222 insertions(+), 87 deletions(-)
 create mode 100644 level_2/modulators/modulators.py

diff --git a/level_2/level_2_pdf_vectorstore__dlt_contracts.py b/level_2/level_2_pdf_vectorstore__dlt_contracts.py
index ebbf82c0d..a60c43fb5 100644
--- a/level_2/level_2_pdf_vectorstore__dlt_contracts.py
+++ b/level_2/level_2_pdf_vectorstore__dlt_contracts.py
@@ -2,8 +2,10 @@ import json
 from enum import Enum
 from io import BytesIO

-from typing import Dict, List, Union
+from typing import Dict, List, Union, Any

+import logging
+logging.basicConfig(level=logging.INFO)
 import marvin
 import requests
 from deep_translator import GoogleTranslator
@@ -290,12 +292,20 @@ class WeaviateVectorDB(VectorDB):
         )

     async def fetch_memories(
-        self, observation: str, namespace: str, params: dict = None
+        self, observation: str, namespace: str, params: dict = None, n_of_observations =int(2)
     ):
-        # Fetch Weaviate memories here
         """
         Get documents from weaviate.

+        Parameters:
+        - observation (str): User query.
+        - namespace (str): Type of memory we access.
+        - params (dict, optional): Optional Weaviate `where` filter applied to the query.
+        - n_of_observations (int, optional): For weaviate, equals to autocut, defaults to 2. Ranges from 1 to 3. Check weaviate docs for more info.
+
+        Returns:
+        The raw Weaviate query response containing the matched documents.
+
         Args a json containing:
            query (str): The query string.
            path (list): The path for filtering, e.g., ['year'].
@@ -304,6 +314,7 @@ class WeaviateVectorDB(VectorDB):

         Example:
             get_from_weaviate(query="some query", path=['year'], operator='Equal', valueText='2017*')
+
         """
         client = self.init_weaviate_client(self.namespace)

@@ -349,6 +360,7 @@ class WeaviateVectorDB(VectorDB):
                     ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score",'distance']
                 )
                 .with_where(params_user_id)
+                .with_limit(10)
                 .do()
             )
             return query_output
@@ -384,8 +396,9 @@ class WeaviateVectorDB(VectorDB):
                     query=observation,
                     fusion_type=HybridFusion.RELATIVE_SCORE
                 )
-                .with_autocut(1)
+                .with_autocut(n_of_observations)
                 .with_where(params_user_id)
+                .with_limit(10)
                 .do()
             )
             return query_output
@@ -493,11 +506,13 @@ class BaseMemory:
         observation: str,
         params: Optional[str] = None,
         namespace: Optional[str] = None,
+        n_of_observations: Optional[int] = 2,
     ):
         if self.db_type == "weaviate":
             return await self.vector_db.fetch_memories(
                 observation=observation, params=params,
-                namespace=namespace
+                namespace=namespace,
+                n_of_observations=n_of_observations
             )

     async def delete_memories(self, params: Optional[str] = None):
@@ -559,6 +574,34 @@ class EpisodicBuffer(BaseMemory):
             model_name="gpt-4-0613",
         )

+    async def _summarizer(self, text: str, document: str, max_tokens: int = 1200):
+        """Summarize text using the OpenAI API, to reduce the amount of content the modulators contribute to the context"""
+        class Summaries(BaseModel):
+            """Schema for a single summarized document"""
+            summary: str = Field(
+                ...,
+                description="Summarized document")
+        class SummaryContextList(BaseModel):
+            """Summaries produced for the buffer, together with the original observation"""
+
+            summaries: List[Summaries] = Field(..., description="List of summaries")
+            observation: str = Field(..., description="The original user query")
+
+        parser = PydanticOutputParser(pydantic_object=SummaryContextList)
+        prompt = PromptTemplate(
+            template=" \n{format_instructions}\nSummarize the observation briefly based on the user query, observation is: {query}\n. The document is: {document}",
+            input_variables=["query", "document"],
+            partial_variables={"format_instructions": parser.get_format_instructions()},
+        )
+
+        _input = prompt.format_prompt(query=text, document=document)
+        document_context_result = self.llm_base(_input.to_string())
+        document_context_result_parsed = parser.parse(document_context_result)
+        document_context_result_parsed = json.loads(document_context_result_parsed.json())
+        document_summary = document_context_result_parsed["summaries"][0]["summary"]
+
+        return document_summary
+
     async def memory_route(self, text_time_diff: str):
         @ai_classifier
         class MemoryRoute(Enum):
@@ -575,8 +618,9 @@ class EpisodicBuffer(BaseMemory):

         return namespace

-    async def freshness(self, observation: str, namespace: str = None) -> list[str]:
+    async def freshness(self, observation: str, namespace: str = None, memory=None) -> list[str]:
        """Freshness - score between 0 and 1 reflecting how recently the information in episodic or semantic memory was last updated"""
+        logging.info("Starting with Freshness")
         lookup_value = await self.fetch_memories(
             observation=observation, namespace=namespace
         )
@@ -589,13 +633,14 @@ class EpisodicBuffer(BaseMemory):
         last_update_datetime = datetime.fromtimestamp(int(unix_t) / 1000)
         time_difference = datetime.now() - last_update_datetime
         time_difference_text = humanize.naturaltime(time_difference)
-        namespace = await self.memory_route(str(time_difference_text))
-        return [namespace.value, lookup_value]
+        namespace_ = await self.memory_route(str(time_difference_text))
+        return [namespace_.value, lookup_value]

-    async def frequency(self, observation: str, namespace: str) -> list[str]:
+    async def frequency(self, observation: str, namespace: str, memory) -> list[str]:
         """Frequency - score between 0 and 1 reflecting how often the information was processed in episodic memory in the past
         Counts the number of times a memory was accessed in the past and divides it by the total number of memories in the episodic memory
         """
+        logging.info("Starting with Frequency")
         weaviate_client = self.init_client(namespace=namespace)

         result_output = await self.fetch_memories(
@@ -610,19 +655,22 @@ class EpisodicBuffer(BaseMemory):
                 "count"
             ]
         )
-        return [str(frequency), result_output["data"]["Get"]["EPISODICMEMORY"][0]]
+        summary = await self._summarizer(text=observation, document=result_output["data"]["Get"]["EPISODICMEMORY"][0])
+        logging.info("Frequency summary is %s", str(summary))
+        return [str(frequency), summary]

-    async def repetition(self, observation: str, namespace: str) -> list[str]:
+    async def repetition(self, observation: str, namespace: str, memory) -> list[str]:
        """Repetition - score between 0 and 1 based on how often and at what intervals a memory has been revisited.
        Accounts for the spacing effect, where memories accessed at increasing intervals are given higher scores.
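        For example, three accesses separated by intervals of 10 and 20 time
        units score (1.0/11 + 1.0/21) / 2 ≈ 0.069 under the inverse-interval
        heuristic implemented below.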
+        # TO DO -> add metadata column to make sure that the access is not equal to update, and run update vector function each time a memory is accessed
        """
-        weaviate_client = self.init_client(namespace=namespace)
+        logging.info("Starting with Repetition")
         result_output = await self.fetch_memories(
             observation=observation, params=None, namespace=namespace
         )
-        access_times = result_output["data"]["Get"]["EPISODICMEMORY"][0]["_additional"]["accessTimes"]
+        access_times = result_output["data"]["Get"]["EPISODICMEMORY"][0]["_additional"]["lastUpdateTimeUnix"]
         # Calculate repetition score based on access times
         if not access_times or len(access_times) == 1:
             return ["0", result_output["data"]["Get"]["EPISODICMEMORY"][0]]
@@ -633,13 +681,15 @@ class EpisodicBuffer(BaseMemory):
         intervals = [access_times[i + 1] - access_times[i] for i in range(len(access_times) - 1)]
         # A simple inverse-interval heuristic: each interval contributes 1.0 / (interval + 1), so shorter gaps currently contribute more to the score
         repetition_score = sum([1.0 / (interval + 1) for interval in intervals]) / len(intervals)
+        summary = await self._summarizer(text=observation, document=result_output["data"]["Get"]["EPISODICMEMORY"][0])
+        logging.info("Repetition is %s", str(repetition_score))
+        logging.info("Repetition summary is %s", str(summary))
+        return [str(repetition_score), summary]

-        return [str(repetition_score), result_output["data"]["Get"]["EPISODICMEMORY"][0]]
-
-    async def relevance(self, observation: str, namespace: str) -> list[str]:
+    async def relevance(self, observation: str, namespace: str, memory) -> list[str]:
         """
-        Fetches the relevance score for a given observation from the episodic memory.
-
+        Fetches the fusion relevance score for a given observation from the episodic memory.
+        Learn more about fusion scores in the Weaviate docs: https://weaviate.io/blog/hybrid-search-fusion-algorithms
         Parameters:
         - observation: The user's query or observation.
         - namespace: The namespace for the data.
@@ -647,40 +697,20 @@ class EpisodicBuffer(BaseMemory):
         Returns:
         - The relevance score between 0 and 1.
         """
+        logging.info("Starting with Relevance")
+        score = memory["_additional"]["score"]
+        logging.info("Relevance is %s", str(score))
+        return [score, "fusion score"]

-        # Fetch the memory content based on the observation
-        result_output = await self.fetch_memories(
-            observation=observation, params=None, namespace=namespace
-        )
-
-        # Extract the relevance score from the memory content
-        score = result_output["data"]["Get"]["EPISODICMEMORY"][0]["_additional"]["score"]
-
-        return score
-
-
-    #each of the requests is numbered, and then the previous requests are retrieved . The request is classified based on past and current content as :
-    # 1. Very positive request
-    # 2. Positive request
-    # 3. Neutral request
-    # 4. Negative request
-    # 5. Very negative request
-
-
-    # After this, we update the weights of the request based on the classification of the request.
-    # After updating the weights, we update the buffer with the new weights. When new weights are calculated, we start from the updated values
-    # Which chunking strategy works best?
-
-    # Adding to the buffer - process the weights, and then use them as filters
-
-    async def saliency(self, observation: str, namespace=None) -> list[str]:
+    async def saliency(self, observation: str, namespace=None, memory=None) -> list[str]:
         """Determines saliency by scoring each retrieved document against the other retrieved documents and ranking how salient it is relative to them
         """
+        logging.info("Starting with Saliency")
         class SaliencyRawList(BaseModel):
             """Schema for documentGroups"""
-            original_document: str = Field(
+            summary: str = Field(
                 ...,
-                description="The original document retrieved from the database")
+                description="Summarized document")
             saliency_score: str = Field(
                 None, description="The score between 0 and 1")
         class SailencyContextList(BaseModel):
@@ -691,7 +721,7 @@ class EpisodicBuffer(BaseMemory):

         parser = PydanticOutputParser(pydantic_object=SailencyContextList)
         prompt = PromptTemplate(
-            template="Determine saliency of documents compared to the other documents retrieved \n{format_instructions}\nOriginal observation is: {query}\n",
+            template="Determine the saliency of each document compared to the other documents retrieved, and summarize it briefly based on the user query \n{format_instructions}\nThe observation is: {query}\n",
             input_variables=["query"],
             partial_variables={"format_instructions": parser.get_format_instructions()},
         )
@@ -699,7 +729,14 @@ class EpisodicBuffer(BaseMemory):
         _input = prompt.format_prompt(query=observation)
         document_context_result = self.llm_base(_input.to_string())
-        document_context_result_parsed = parser.parse(document_context_result)
-        return document_context_result_parsed.json()
+        document_context_result_parsed = parser.parse(document_context_result)
+        document_context_result_parsed = json.loads(document_context_result_parsed.json())
+        saliency_score = document_context_result_parsed["docs"][0]["saliency_score"]
+        saliency_values = document_context_result_parsed["docs"][0]["summary"]
+
+        logging.info("Saliency is %s", str(saliency_score))
+        logging.info("Saliency summary is %s", str(saliency_values))
+
+        return [saliency_score, saliency_values]
@@ -722,6 +759,7 @@ class EpisodicBuffer(BaseMemory):
         attention_modulators: Dict[str, float],
         observation: str,
         namespace: Optional[str] = None,
+        memory: Optional[Dict[str, Any]] = None,
     ) -> Optional[List[Union[str, float]]]:
         """
         Handle the given modulator based on the observation and namespace.
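        For example (values here are hypothetical), a call such as
            await self.handle_modulator(
                modulator_name="freshness",
                attention_modulators={"freshness": 0.5},
                observation="How does Buck adapt to life in the wild?",
                namespace="EPISODICMEMORY",
                memory=candidate_memory,  # a result dict fetched from Weaviate
            )
        returns the [score, value] pair produced by the freshness coroutine
        when float(score) >= 0.5, and None otherwise.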
@@ -737,22 +775,25 @@ class EpisodicBuffer(BaseMemory): """ modulator_value = attention_modulators.get(modulator_name, 0.0) modulator_functions = { - "freshness": lambda obs, ns: self.freshness(observation=obs, namespace=ns), - "frequency": lambda obs, ns: self.frequency(observation=obs, namespace=ns), - "relevance": lambda obs, ns: self.relevance(observation=obs, namespace=ns), - "saliency": lambda obs, ns: self.saliency(observation=obs, namespace=ns), + "freshness": lambda obs, ns, mem: self.freshness(observation=obs, namespace=ns, memory=mem), + "frequency": lambda obs, ns, mem: self.frequency(observation=obs, namespace=ns, memory=mem), + "relevance": lambda obs, ns, mem: self.relevance(observation=obs, namespace=ns, memory=mem), + "saliency": lambda obs, ns, mem: self.saliency(observation=obs, namespace=ns, memory=mem), } result_func = modulator_functions.get(modulator_name) if not result_func: return None - result = await result_func(observation, namespace) + result = await result_func(observation, namespace, memory) if not result: return None try: - if float(modulator_value) >= float(result[0]): + logging.info("Modulator %s", modulator_name) + logging.info("Modulator value %s", modulator_value) + logging.info("Result %s", result[0]) + if float(result[0]) >= float(modulator_value): return result except ValueError: pass @@ -809,11 +850,11 @@ class EpisodicBuffer(BaseMemory): # check if modulators exist, initialize the modulators if needed if attention_modulators is None: # try: - print("Starting with attention mods") + logging.info("Starting with attention mods") attention_modulators = await self.fetch_memories(observation="Attention modulators", namespace="BUFFERMEMORY") - print("Attention modulators exist", str(attention_modulators)) + logging.info("Attention modulators exist %s", str(attention_modulators)) lookup_value_episodic = await self.fetch_memories( observation=str(output), namespace="EPISODICMEMORY" ) @@ -896,26 +937,52 @@ class EpisodicBuffer(BaseMemory): lookup_value_semantic = await self.fetch_memories( observation=str(output), namespace="SEMANTICMEMORY" ) + print("This is the lookup value semantic", len(lookup_value_semantic)) context = [] - for memory in lookup_value_semantic["data"]["Get"]["SEMANTICMEMORY"]: - # extract memory id, and pass it to fetch function as a parameter + memory_scores = [] + + async def compute_score_for_memory(memory, output, attention_modulators): modulators = list(attention_modulators.keys()) + total_score = 0 + num_scores = 0 + individual_scores = {} # Store individual scores with their modulator names + for modulator in modulators: result = await self.handle_modulator( - modulator, - attention_modulators, - str(output), + modulator_name=modulator, + attention_modulators=attention_modulators, + observation=str(output), namespace="EPISODICMEMORY", + memory=memory, ) if result: - context.append(result) - context.append(memory) + score = float(result[0]) # Assuming the first value in result is the score + individual_scores[modulator] = score # Store the score with its modulator name + total_score += score + num_scores += 1 + average_score = total_score / num_scores if num_scores else 0 + return { + "memory": memory, + "average_score": average_score, + "individual_scores": individual_scores + } + + tasks = [ + compute_score_for_memory(memory=memory, output=output, attention_modulators=attention_modulators) + for memory in lookup_value_semantic["data"]["Get"]["SEMANTICMEMORY"] + ] + + print("HERE IS THE LENGTH OF THE TASKS", str(tasks)) + 
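+        # Score every candidate semantic memory concurrently; each task returns
+        # the memory together with its average and per-modulator scores, so the
+        # strongest candidates can be ranked below.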
memory_scores = await asyncio.gather(*tasks)
+        # Sort the memories based on their average scores
+        sorted_memories = sorted(memory_scores, key=lambda x: x["average_score"], reverse=True)[:5]
+        # Store the sorted memories in the context
+        context.extend([item for item in sorted_memories])
+        print("HERE IS THE CONTEXT", context)

         class BufferModulators(BaseModel):
-            frequency: str = Field(..., description="Frequency score of the document")
-            saliency: str = Field(..., description="Saliency score of the document")
-            relevance: str = Field(..., description="Relevance score of the document")
+            attention_modulators: Dict[str, float] = Field(..., description="Attention modulators")

         class BufferRawContextTerms(BaseModel):
             """Schema for documentGroups"""
@@ -927,18 +994,29 @@ class EpisodicBuffer(BaseMemory):
             document_content: str = Field(
                 None, description="Shortened original content of the document"
             )
-            document_relevance: str = Field(
-                None,
-                description="The relevance of the document for the task on the scale from 0 to 1",
-            )
             attention_modulators_list: List[BufferModulators] = Field(
                 ..., description="List of modulators"
             )
+            average_modulator_score: str = Field(None, description="Average modulator score")
+        class StructuredEpisodicEvents(BaseModel):
+            """Schema for structured episodic events"""
+
+            event_order: str = Field(
+                ...,
+                description="The order in which the event occurred",
+            )
+            event_type: str = Field(
+                None, description="Type of the event"
+            )
+            event_context: List[BufferModulators] = Field(
+                ..., description="Context of the event"
+            )

         class BufferRawContextList(BaseModel):
             """Buffer raw context processed by the buffer"""

             docs: List[BufferRawContextTerms] = Field(..., description="List of docs")
+            events: List[StructuredEpisodicEvents] = Field(..., description="List of events")
             user_query: str = Field(..., description="The original user query")

         # we structure the data here to make it easier to work with
@@ -956,6 +1034,7 @@ class EpisodicBuffer(BaseMemory):
         _input = prompt.format_prompt(query=user_input, context=context)
         document_context_result = self.llm_base(_input.to_string())
         document_context_result_parsed = parser.parse(document_context_result)
+        # print(document_context_result_parsed)
         return document_context_result_parsed

     async def get_task_list(
@@ -1373,7 +1452,7 @@ async def main():

 async def main():
     # if you want to run the script as a standalone script, do so with the examples below
-    memory = Memory(user_id="123")
+    memory = Memory(user_id="TestUser")
     await memory.async_init()
     params = {
@@ -1396,9 +1475,10 @@ async def main():

     # load_jack_london = await memory._add_semantic_memory(observation = "bla", loader_settings=loader_settings, params=params)
     # print(load_jack_london)
-    modulator = {"relevance": 0.0, "saliency": 0.0, "frequency": 0.0}
-
-    run_main_buffer = await memory._run_main_buffer(
+    modulator = {"relevance": 0.1, "frequency": 0.1}
+    # await memory._delete_episodic_memory()
+    #
+    run_main_buffer = await memory._create_buffer_context(
         user_input="I want to know how does Buck adapt to life in the wild and then have that info translated to german ",
         params=params,
         attention_modulators=modulator,
diff --git a/level_2/modulators/modulators.py b/level_2/modulators/modulators.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/level_2/utils.py b/level_2/utils.py
index 319f4b500..0baa766d7 100644
--- a/level_2/utils.py
+++ b/level_2/utils.py
@@ -1,10 +1,12 @@
 import os

 from datetime import datetime
+from typing import List

 from langchain import
PromptTemplate, OpenAI from langchain.output_parsers import PydanticOutputParser from pydantic import BaseModel, Field import dotenv +from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory dotenv.load_dotenv() llm_base = OpenAI( @@ -13,9 +15,11 @@ llm_base = OpenAI( openai_api_key=os.environ.get("OPENAI_API_KEY"), model_name="gpt-4-0613", ) -def _add_to_episodic(user_input, tasks_list, result_tasks, attention_modulators, params): - from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory +async def _add_to_episodic(user_input, tasks_list, result_tasks, attention_modulators, params): + + memory = Memory(user_id="TestUser") + await memory.async_init() class EpisodicTask(BaseModel): """Schema for an individual task.""" @@ -62,27 +66,65 @@ def _add_to_episodic(user_input, tasks_list, result_tasks, attention_modulators, # return "a few things to do like load episodic memory in a structured format" output = llm_base(_input.to_string()) result_parsing = parser.parse(output) - lookup_value = await memory.add_memories( - observation=str(result_parsing.json()), params=params, namespace='EPISODICMEMORY' + lookup_value = await memory._add_episodic_memory( + observation=str(result_parsing.json()), params=params ) -def add_to_buffer(): - pass +async def add_to_buffer(adjusted_modulator=None, params={}): + memory = Memory(user_id="TestUser") + await memory.async_init() + class BufferModulators(BaseModel): + """Value of buffer modulators""" + frequency: str = Field(..., description="Frequency score of the document") + saliency: str = Field(..., description="Saliency score of the document") + relevance: str = Field(..., description="Relevance score of the document") + description: str = Field(..., description="Latest buffer modulators") + direction: str = Field(..., description="Increase or a decrease of the modulator") + + parser = PydanticOutputParser(pydantic_object=BufferModulators) + + prompt = PromptTemplate( + template="""Structure the buffer modulators to be used for the buffer. 
\n + {format_instructions} \nOriginal observation is: + {query}\n """, + input_variables=["query"], + partial_variables={"format_instructions": parser.get_format_instructions()}, + ) + _input = prompt.format_prompt(query=adjusted_modulator) + document_context_result = llm_base(_input.to_string()) + document_context_result_parsed = parser.parse(document_context_result) + await memory._add_buffer_memory(user_input=str(document_context_result_parsed), params=params) + return document_context_result_parsed.json() -def delete_from_buffer(): +async def delete_from_buffer(): from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory memory = Memory(user_id="TestUser") - memory._delete_buffer_memory() + await memory.async_init() + await memory._delete_buffer_memory() -def delete_from_episodic(): +async def delete_from_episodic(): from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory memory = Memory(user_id="TestUser") - memory._delete_episodic_memory() + await memory.async_init() + await memory._delete_episodic_memory() -if __name__ == "__main__": +async def get_from_episodic(observation=None): + from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory + memory = Memory(user_id="TestUser") + await memory.async_init() + return await memory._fetch_episodic_memory(observation=observation) + +async def get_from_buffer(observation=None): + from level_2.level_2_pdf_vectorstore__dlt_contracts import Memory + memory = Memory(user_id="TestUser") + await memory.async_init() + return await memory._fetch_buffer_memory(user_input=observation) + + +async def main(): params = { "version": "1.0", "agreement_id": "AG123456", @@ -101,11 +143,24 @@ if __name__ == "__main__": "source": "url", "path": "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" } - modulator = {"relevance": 0.0, "saliency": 0.0, "frequency": 0.0} + modulator = {"relevance": 1.0, "saliency": 1.0, "frequency": 1.0, "freshness": 1.0, "repetition": 1.0} user_input = "I want to know how does Buck adapt to life in the wild" # tasks_list = """tasks": [{"task_order": "1", "task_name": "Fetch Information", "operation": "fetch from vector store", "original_query": "I want to know how does Buck adapt to life in the wild"]""" out_tasks = """here are the result_tasks [{'task_order': '1', 'task_name': 'Fetch Information', 'operation': 'fetch from vector store', 'original_query': 'I want to know how does Buck adapt to life in the wild'}, {'docs': [{'semantic_search_term': "Buck's adaptation to wild life", 'document_content': 'THE CALL OF THE WILD 30 \nout of his desire for mastery. He was preëminently cunning, and could \nbide his time with a patience that was nothing less than primitive. \nIt was inevitable that the clash for leadership should come. Buck \nwanted it. He wanted it because it was his nature, because he had been \ngripped tight by that nameless, incomprehensible pride of the trail and \ntrace—that pride which holds dogs in the toil to the last gasp, which \nlures them to die joyfully in the harness, and breaks their hearts if they \nare cut out of the harness. This was the pride of Dave as wheel-dog, of \nSol-leks as he pulled with all his strength; the pride that laid hold of \nthem at break of camp, transforming them from sour and sullen brutes \ninto straining, eager, ambitious creatures; the pride that spurred them on \nall day and dropped them at pitch of camp at night, letting them fall back \ninto gloomy unrest and uncontent. 
This was the pride that bore up Spitz \nand made him thrash the sled-dogs who blundered and shirked in the \ntraces or hid away at harness-up time in the morning. Likewise it was \nthis pride that made him fear Buck as a possible lead-dog. And this was \nBuck’s pride, too. \nHe openly threatened the other’s leadership. He came between him \nand the shirks he should have punished. And he did it deliberately. One \nnight there was a heavy snowfall, and in the morning Pike, the \nmalingerer, did not appear. He was securely hidden in his nest under a \nfoot of snow. François called him and sought him in vain. Spitz was wild \nwith wrath. He raged through the camp, smelling and digging in every \nlikely place, snarling so frightfully that Pike heard and shivered in his \nhiding-place. \nBut when he was at last unearthed, and Spitz flew at him to punish \nhim, Buck flew, with equal rage, in between. So unexpected was it, and \nso shrewdly managed, that Spitz was hurled backward and off his feet. \nPike, who had been trembling abjectly, took heart at this open mutiny, \nand sprang upon his overthrown leader. Buck, to whom fairplay was a \nforgotten code, likewise sprang upon Spitz. But François, chuckling at \nthe incident while unswerving in the administration of justice, brought \nhis lash down upon Buck with all his might. This failed to drive Buck \nfrom his prostrate rival, and the butt of the whip was brought into play. \nHalf-stunned by the blow, Buck was knocked backward and the lash laid', 'document_relevance': '0.75', 'attention_modulators_list': [{'frequency': 'High', 'saliency': 'High', 'relevance': 'High'}]}], 'user_query': 'I want to know how does Buck adapt to life in the wild and then have that info translated to german'}, {'task_order': '2', 'task_name': 'Translate Information', 'operation': 'translate', 'original_query': 'then have that info translated to german'}, 'DER RUF DER WILDNIS 30\naus seinem Wunsch nach Meisterschaft. Er war überaus schlau und konnte es\nwartete seine Zeit mit einer Geduld ab, die geradezu primitiv war.\nEs war unvermeidlich, dass es zu einem Kampf um die Führung kam. Bock\nwollte es. Er wollte es, weil es in seiner Natur lag, weil er es gewesen war\nfestgehalten von diesem namenlosen, unverständlichen Stolz des Weges und\nSpur – dieser Stolz, der Hunde bis zum letzten Atemzug in der Mühsal hält, der\nlockt sie dazu, freudig im Geschirr zu sterben, und bricht ihnen das Herz, wenn sie es tun\nwerden aus dem Kabelbaum herausgeschnitten. Das war der Stolz von Dave als Radhund\nSol-leks, als er mit aller Kraft zog; der Stolz, der mich ergriff\nsie beim Abbruch des Lagers und verwandelte sie in mürrische und mürrische Bestien\nin anstrengende, eifrige, ehrgeizige Wesen; der Stolz, der sie anspornte\nden ganzen Tag und setzte sie nachts auf dem Stellplatz des Lagers ab und ließ sie zurückfallen\nin düstere Unruhe und Unzufriedenheit. Das war der Stolz, der Spitz trug\nund ließ ihn die Schlittenhunde verprügeln, die in der Gegend herumstolperten und sich scheuten\nSpuren hinterlassen oder sich morgens beim Angurten versteckt haben. Ebenso war es\nDieser Stolz ließ ihn Buck als möglichen Leithund fürchten. Und das war\nAuch Bucks Stolz.\nEr bedrohte offen die Führung des anderen. Er kam zwischen ihn\nund die Schirks hätte er bestrafen sollen. Und er hat es absichtlich getan. Eins\nNachts gab es starken Schneefall und am Morgen Pike, der\nSimulant, erschien nicht. Er war sicher in seinem Nest unter einer Decke versteckt\nFuß Schnee. 
François rief ihn an und suchte ihn vergeblich. Spitz war wild\nmit Zorn. Er tobte durch das Lager, schnupperte und wühlte darin herum\nWahrscheinlicher Ort, er knurrte so schrecklich, dass Pike es hörte und in seinem Kopf zitterte\nVersteck.\nAber als er endlich ausgegraben wurde, flog Spitz auf ihn zu, um ihn zu bestrafen\nihm, Buck flog mit der gleichen Wut dazwischen. So unerwartet war es, und\nEs gelang ihm so geschickt, dass Spitz nach hinten geschleudert wurde und von den Füßen fiel.\nPike, der erbärmlich gezittert hatte, fasste angesichts dieser offenen Meuterei Mut.\nund sprang auf seinen gestürzten Anführer. Buck, für den Fairplay wichtig war\nvergessener Code, sprang ebenfalls Spitz auf. Aber François kicherte\nder Vorfall, während unerschütterlich in der Rechtspflege, gebracht\nEr schlug mit aller Kraft auf Buck ein. Das gelang Buck nicht\nvon seinem am Boden liegenden Rivalen, und der Peitschenkolben wurde ins Spiel gebracht.\nVon dem Schlag halb betäubt, wurde Buck nach hinten geschleudert und mit der Peitsche niedergeschlagen']""" - # _add_to_episodic(user_input=user_input, result_tasks=out_tasks, modulator=None, params=params) + await _add_to_episodic(user_input=user_input, result_tasks=out_tasks, tasks_list=None, attention_modulators=modulator, params=params) + # await delete_from_episodic() + # aa = await get_from_episodic(observation="summary") + # await delete_from_buffer() + modulator_changed = {"relevance": 0.9, "saliency": 0.9, "frequency": 0.9} + await add_to_buffer(adjusted_modulator=modulator_changed) + + # aa = await get_from_buffer(observation="summary") + # print(aa) + +if __name__ == "__main__": + import asyncio + + asyncio.run(main()) From 1cfa76c091d94f8fa86c6c63587b146b7b9fe32a Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Mon, 11 Sep 2023 21:14:10 +0200 Subject: [PATCH 4/5] Updated the code considerably to fix issues with context overloads --- .../level_2_pdf_vectorstore__dlt_contracts.py | 448 +----------------- level_2/modulators/modulators.py | 32 ++ level_2/vectordb/basevectordb.py | 447 +++++++++++++++++ 3 files changed, 481 insertions(+), 446 deletions(-) create mode 100644 level_2/vectordb/basevectordb.py diff --git a/level_2/level_2_pdf_vectorstore__dlt_contracts.py b/level_2/level_2_pdf_vectorstore__dlt_contracts.py index a60c43fb5..a5a3dd52d 100644 --- a/level_2/level_2_pdf_vectorstore__dlt_contracts.py +++ b/level_2/level_2_pdf_vectorstore__dlt_contracts.py @@ -74,452 +74,8 @@ marvin.settings.openai.api_key = os.environ.get("OPENAI_API_KEY") # Assuming OpenAIEmbeddings and other necessary imports are available -# Default Values -LTM_MEMORY_ID_DEFAULT = "00000" -ST_MEMORY_ID_DEFAULT = "0000" -BUFFER_ID_DEFAULT = "0000" - - -class DifferentiableLayer: - def __init__(self, attention_modulators: dict): - self.weights = {modulator: 1.0 for modulator in attention_modulators} - self.learning_rate = 0.1 - self.regularization_lambda = 0.01 - self.weight_decay = 0.99 - - async def adjust_weights(self, feedbacks: list[float]): - """ - Adjusts the weights of the attention modulators based on user feedbacks. - - Parameters: - - feedbacks: A list of feedback scores (between 0 and 1). 
- """ - avg_feedback = np.mean(feedbacks) - feedback_diff = 1.0 - avg_feedback - - # Adjust weights based on average feedback - for modulator in self.weights: - self.weights[modulator] += self.learning_rate * (-feedback_diff) - self.regularization_lambda * \ - self.weights[modulator] - self.weights[modulator] *= self.weight_decay - - # Decaying the learning rate - self.learning_rate *= 0.99 - - async def get_weights(self): - return self.weights - -class VectorDBFactory: - def create_vector_db( - self, - user_id: str, - index_name: str, - memory_id: str, - ltm_memory_id: str = LTM_MEMORY_ID_DEFAULT, - st_memory_id: str = ST_MEMORY_ID_DEFAULT, - buffer_id: str = BUFFER_ID_DEFAULT, - db_type: str = "pinecone", - namespace: str = None, - ): - db_map = {"pinecone": PineconeVectorDB, "weaviate": WeaviateVectorDB} - - if db_type in db_map: - return db_map[db_type]( - user_id, - index_name, - memory_id, - ltm_memory_id, - st_memory_id, - buffer_id, - namespace, - ) - - raise ValueError(f"Unsupported database type: {db_type}") - - -class VectorDB: - OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") - - def __init__( - self, - user_id: str, - index_name: str, - memory_id: str, - ltm_memory_id: str = LTM_MEMORY_ID_DEFAULT, - st_memory_id: str = ST_MEMORY_ID_DEFAULT, - buffer_id: str = BUFFER_ID_DEFAULT, - namespace: str = None, - ): - self.user_id = user_id - self.index_name = index_name - self.namespace = namespace - self.memory_id = memory_id - self.ltm_memory_id = ltm_memory_id - self.st_memory_id = st_memory_id - self.buffer_id = buffer_id - - -class PineconeVectorDB(VectorDB): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.init_pinecone(self.index_name) - - def init_pinecone(self, index_name): - # Pinecone initialization logic - pass - - -class WeaviateVectorDB(VectorDB): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.init_weaviate(self.namespace) - - def init_weaviate(self, namespace: str): - # Weaviate initialization logic - embeddings = OpenAIEmbeddings() - auth_config = weaviate.auth.AuthApiKey( - api_key=os.environ.get("WEAVIATE_API_KEY") - ) - client = weaviate.Client( - url=os.environ.get("WEAVIATE_URL"), - auth_client_secret=auth_config, - additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, - ) - retriever = WeaviateHybridSearchRetriever( - client=client, - index_name=namespace, - text_key="text", - attributes=[], - embedding=embeddings, - create_schema_if_missing=True, - ) - return retriever # If this is part of the initialization, call it here. 
- - def init_weaviate_client(self, namespace: str): - # Weaviate client initialization logic - auth_config = weaviate.auth.AuthApiKey( - api_key=os.environ.get("WEAVIATE_API_KEY") - ) - client = weaviate.Client( - url=os.environ.get("WEAVIATE_URL"), - auth_client_secret=auth_config, - additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, - ) - return client - - def _document_loader(self, observation: str, loader_settings: dict): - # Create an in-memory file-like object for the PDF content - - if loader_settings.get("format") == "PDF": - - if loader_settings.get("source") == "url": - pdf_response = requests.get(loader_settings["path"]) - pdf_stream = BytesIO(pdf_response.content) - contents = pdf_stream.read() - tmp_location = os.path.join("/tmp", "tmp.pdf") - with open(tmp_location, "wb") as tmp_file: - tmp_file.write(contents) - - # Process the PDF using PyPDFLoader - loader = PyPDFLoader(tmp_location) - # adapt this for different chunking strategies - pages = loader.load_and_split() - return pages - - if loader_settings.get("source") == "file": - # Process the PDF using PyPDFLoader - # might need adapting for different loaders + OCR - # need to test the path - loader = PyPDFLoader(loader_settings["path"]) - pages = loader.load_and_split() - - return pages - else: - # Process the text by just loading the base text - return observation - - - async def add_memories( - self, observation: str, loader_settings: dict = None, params: dict = None ,namespace:str=None - ): - # Update Weaviate memories here - print(self.namespace) - if namespace is None: - namespace = self.namespace - retriever = self.init_weaviate(namespace) - - def _stuct(observation, params): - """Utility function to not repeat metadata structure""" - # needs smarter solution, like dynamic generation of metadata - return [ - Document( - metadata={ - # "text": observation, - "user_id": str(self.user_id), - "memory_id": str(self.memory_id), - "ltm_memory_id": str(self.ltm_memory_id), - "st_memory_id": str(self.st_memory_id), - "buffer_id": str(self.buffer_id), - "version": params.get("version", None) or "", - "agreement_id": params.get("agreement_id", None) or "", - "privacy_policy": params.get("privacy_policy", None) or "", - "terms_of_service": params.get("terms_of_service", None) or "", - "format": params.get("format", None) or "", - "schema_version": params.get("schema_version", None) or "", - "checksum": params.get("checksum", None) or "", - "owner": params.get("owner", None) or "", - "license": params.get("license", None) or "", - "validity_start": params.get("validity_start", None) or "", - "validity_end": params.get("validity_end", None) or "" - # **source_metadata, - }, - page_content=observation, - ) - ] - - if loader_settings: - # Load the document - document = self._document_loader(observation, loader_settings) - print("DOC LENGTH", len(document)) - for doc in document: - document_to_load = _stuct(doc.page_content, params) - retriever.add_documents( - document_to_load - ) - - return retriever.add_documents( - _stuct(observation, params) - ) - - async def fetch_memories( - self, observation: str, namespace: str, params: dict = None, n_of_observations =int(2) - ): - """ - Get documents from weaviate. - - Parameters: - - observation (str): User query. - - namespace (str): Type of memory we access. - - params (dict, optional): - - n_of_observations (int, optional): For weaviate, equals to autocut, defaults to 1. Ranges from 1 to 3. Check weaviate docs for more info. 
- - Returns: - Describe the return type and what the function returns. - - Args a json containing: - query (str): The query string. - path (list): The path for filtering, e.g., ['year']. - operator (str): The operator for filtering, e.g., 'Equal'. - valueText (str): The value for filtering, e.g., '2017*'. - - Example: - get_from_weaviate(query="some query", path=['year'], operator='Equal', valueText='2017*') - - """ - client = self.init_weaviate_client(self.namespace) - - print(self.namespace) - print(str(datetime.now())) - print(observation) - if namespace is None: - namespace = self.namespace - - params_user_id = { - "path": ["user_id"], - "operator": "Like", - "valueText": self.user_id, - } - - if params: - query_output = ( - client.query.get( - namespace, - [ - # "text", - "user_id", - "memory_id", - "ltm_memory_id", - "st_memory_id", - "buffer_id", - "version", - "agreement_id", - "privacy_policy", - "terms_of_service", - "format", - "schema_version", - "checksum", - "owner", - "license", - "validity_start", - "validity_end", - ], - ) - .with_where(params) - .with_near_text({"concepts": [observation]}) - .with_additional( - ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score",'distance'] - ) - .with_where(params_user_id) - .with_limit(10) - .do() - ) - return query_output - else: - query_output = ( - client.query.get( - namespace, - - [ - "text", - "user_id", - "memory_id", - "ltm_memory_id", - "st_memory_id", - "buffer_id", - "version", - "agreement_id", - "privacy_policy", - "terms_of_service", - "format", - "schema_version", - "checksum", - "owner", - "license", - "validity_start", - "validity_end", - ], - ) - .with_additional( - ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score", 'distance'] - ) - .with_hybrid( - query=observation, - fusion_type=HybridFusion.RELATIVE_SCORE - ) - .with_autocut(n_of_observations) - .with_where(params_user_id) - .with_limit(10) - .do() - ) - return query_output - - async def delete_memories(self, params: dict = None): - client = self.init_weaviate_client(self.namespace) - if params: - where_filter = { - "path": ["id"], - "operator": "Equal", - "valueText": params.get("id", None), - } - return client.batch.delete_objects( - class_name=self.namespace, - # Same `where` filter as in the GraphQL API - where=where_filter, - ) - else: - # Delete all objects - print("HERE IS THE USER ID", self.user_id) - return client.batch.delete_objects( - class_name=self.namespace, - where={ - "path": ["user_id"], - "operator": "Equal", - "valueText": self.user_id, - }, - ) - - def update_memories(self, observation, namespace: str, params: dict = None): - client = self.init_weaviate_client(self.namespace) - - client.data_object.update( - data_object={ - # "text": observation, - "user_id": str(self.user_id), - "memory_id": str(self.memory_id), - "ltm_memory_id": str(self.ltm_memory_id), - "st_memory_id": str(self.st_memory_id), - "buffer_id": str(self.buffer_id), - "version": params.get("version", None) or "", - "agreement_id": params.get("agreement_id", None) or "", - "privacy_policy": params.get("privacy_policy", None) or "", - "terms_of_service": params.get("terms_of_service", None) or "", - "format": params.get("format", None) or "", - "schema_version": params.get("schema_version", None) or "", - "checksum": params.get("checksum", None) or "", - "owner": params.get("owner", None) or "", - "license": params.get("license", None) or "", - "validity_start": params.get("validity_start", None) or "", - "validity_end": params.get("validity_end", None) or "" - # 
**source_metadata, - }, - class_name="Test", - uuid=params.get("id", None), - consistency_level=weaviate.data.replication.ConsistencyLevel.ALL, # default QUORUM - ) - return - - -class BaseMemory: - def __init__( - self, - user_id: str, - memory_id: Optional[str], - index_name: Optional[str], - db_type: str, - namespace: str, - ): - self.user_id = user_id - self.memory_id = memory_id - self.index_name = index_name - self.namespace = namespace - self.memory_type_id = str(uuid.uuid4()) - self.db_type = db_type - factory = VectorDBFactory() - self.vector_db = factory.create_vector_db( - self.user_id, - self.index_name, - self.memory_id, - db_type=self.db_type, - namespace=self.namespace, - ) - - def init_client(self, namespace: str): - if self.db_type == "weaviate": - return self.vector_db.init_weaviate_client(namespace) - - async def add_memories( - self, - observation: Optional[str] = None, - loader_settings: dict = None, - params: Optional[dict] = None, - namespace: Optional[str] = None, - ): - if self.db_type == "weaviate": - return await self.vector_db.add_memories( - observation=observation, loader_settings=loader_settings, - params=params, namespace=namespace - ) - # Add other db_type conditions if necessary - - async def fetch_memories( - self, - observation: str, - params: Optional[str] = None, - namespace: Optional[str] = None, - n_of_observations: Optional[int] = 2, - ): - if self.db_type == "weaviate": - return await self.vector_db.fetch_memories( - observation=observation, params=params, - namespace=namespace, - n_of_observations=n_of_observations - ) - - async def delete_memories(self, params: Optional[str] = None): - if self.db_type == "weaviate": - return await self.vector_db.delete_memories(params) - - # Additional methods for specific Memory can be added here +from vectordb.basevectordb import BaseMemory, PineconeVectorDB, WeaviateVectorDB +from modulators.modulators import DifferentiableLayer class SemanticMemory(BaseMemory): diff --git a/level_2/modulators/modulators.py b/level_2/modulators/modulators.py index e69de29bb..4efefb044 100644 --- a/level_2/modulators/modulators.py +++ b/level_2/modulators/modulators.py @@ -0,0 +1,32 @@ +import numpy as np + + +class DifferentiableLayer: + def __init__(self, attention_modulators: dict): + self.weights = {modulator: 1.0 for modulator in attention_modulators} + self.learning_rate = 0.1 + self.regularization_lambda = 0.01 + self.weight_decay = 0.99 + + async def adjust_weights(self, feedbacks: list[float]): + """ + Adjusts the weights of the attention modulators based on user feedbacks. + + Parameters: + - feedbacks: A list of feedback scores (between 0 and 1). 
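+
+        For example, with feedbacks=[0.5]: avg_feedback is 0.5, feedback_diff
+        is 0.5, and a weight of 1.0 becomes
+        (1.0 + 0.1 * (-0.5) - 0.01 * 1.0) * 0.99 = 0.9306, while the
+        learning rate decays from 0.1 to 0.099.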
+ """ + avg_feedback = np.mean(feedbacks) + feedback_diff = 1.0 - avg_feedback + + # Adjust weights based on average feedback + for modulator in self.weights: + self.weights[modulator] += self.learning_rate * (-feedback_diff) - self.regularization_lambda * \ + self.weights[modulator] + self.weights[modulator] *= self.weight_decay + + # Decaying the learning rate + self.learning_rate *= 0.99 + + async def get_weights(self): + return self.weights + diff --git a/level_2/vectordb/basevectordb.py b/level_2/vectordb/basevectordb.py new file mode 100644 index 000000000..664526281 --- /dev/null +++ b/level_2/vectordb/basevectordb.py @@ -0,0 +1,447 @@ +# Make sure to install the following packages: dlt, langchain, duckdb, python-dotenv, openai, weaviate-client +import logging +from io import BytesIO + +logging.basicConfig(level=logging.INFO) +import marvin +import requests +from dotenv import load_dotenv +from langchain.document_loaders import PyPDFLoader +from langchain.retrievers import WeaviateHybridSearchRetriever +from weaviate.gql.get import HybridFusion + +load_dotenv() +from typing import Optional + +import tracemalloc + +tracemalloc.start() + +import os +from datetime import datetime +from langchain.embeddings.openai import OpenAIEmbeddings +from dotenv import load_dotenv +from langchain.schema import Document +import uuid +import weaviate + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") +marvin.settings.openai.api_key = os.environ.get("OPENAI_API_KEY") + +LTM_MEMORY_ID_DEFAULT = "00000" +ST_MEMORY_ID_DEFAULT = "0000" +BUFFER_ID_DEFAULT = "0000" + + +class VectorDBFactory: + def create_vector_db( + self, + user_id: str, + index_name: str, + memory_id: str, + ltm_memory_id: str = LTM_MEMORY_ID_DEFAULT, + st_memory_id: str = ST_MEMORY_ID_DEFAULT, + buffer_id: str = BUFFER_ID_DEFAULT, + db_type: str = "pinecone", + namespace: str = None, + ): + db_map = {"pinecone": PineconeVectorDB, "weaviate": WeaviateVectorDB} + + if db_type in db_map: + return db_map[db_type]( + user_id, + index_name, + memory_id, + ltm_memory_id, + st_memory_id, + buffer_id, + namespace, + ) + + raise ValueError(f"Unsupported database type: {db_type}") + + +class VectorDB: + OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") + + def __init__( + self, + user_id: str, + index_name: str, + memory_id: str, + ltm_memory_id: str = LTM_MEMORY_ID_DEFAULT, + st_memory_id: str = ST_MEMORY_ID_DEFAULT, + buffer_id: str = BUFFER_ID_DEFAULT, + namespace: str = None, + ): + self.user_id = user_id + self.index_name = index_name + self.namespace = namespace + self.memory_id = memory_id + self.ltm_memory_id = ltm_memory_id + self.st_memory_id = st_memory_id + self.buffer_id = buffer_id + +class PineconeVectorDB(VectorDB): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.init_pinecone(self.index_name) + + def init_pinecone(self, index_name): + # Pinecone initialization logic + pass + + +class WeaviateVectorDB(VectorDB): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.init_weaviate(self.namespace) + + def init_weaviate(self, namespace: str): + # Weaviate initialization logic + embeddings = OpenAIEmbeddings() + auth_config = weaviate.auth.AuthApiKey( + api_key=os.environ.get("WEAVIATE_API_KEY") + ) + client = weaviate.Client( + url=os.environ.get("WEAVIATE_URL"), + auth_client_secret=auth_config, + additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, + ) + retriever = WeaviateHybridSearchRetriever( + client=client, + 
index_name=namespace, + text_key="text", + attributes=[], + embedding=embeddings, + create_schema_if_missing=True, + ) + return retriever # If this is part of the initialization, call it here. + + def init_weaviate_client(self, namespace: str): + # Weaviate client initialization logic + auth_config = weaviate.auth.AuthApiKey( + api_key=os.environ.get("WEAVIATE_API_KEY") + ) + client = weaviate.Client( + url=os.environ.get("WEAVIATE_URL"), + auth_client_secret=auth_config, + additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, + ) + return client + + def _document_loader(self, observation: str, loader_settings: dict): + # Create an in-memory file-like object for the PDF content + + if loader_settings.get("format") == "PDF": + + if loader_settings.get("source") == "url": + pdf_response = requests.get(loader_settings["path"]) + pdf_stream = BytesIO(pdf_response.content) + contents = pdf_stream.read() + tmp_location = os.path.join("/tmp", "tmp.pdf") + with open(tmp_location, "wb") as tmp_file: + tmp_file.write(contents) + + # Process the PDF using PyPDFLoader + loader = PyPDFLoader(tmp_location) + # adapt this for different chunking strategies + pages = loader.load_and_split() + return pages + + if loader_settings.get("source") == "file": + # Process the PDF using PyPDFLoader + # might need adapting for different loaders + OCR + # need to test the path + loader = PyPDFLoader(loader_settings["path"]) + pages = loader.load_and_split() + + return pages + else: + # Process the text by just loading the base text + return observation + + + async def add_memories( + self, observation: str, loader_settings: dict = None, params: dict = None ,namespace:str=None + ): + # Update Weaviate memories here + print(self.namespace) + if namespace is None: + namespace = self.namespace + retriever = self.init_weaviate(namespace) + + def _stuct(observation, params): + """Utility function to not repeat metadata structure""" + # needs smarter solution, like dynamic generation of metadata + return [ + Document( + metadata={ + # "text": observation, + "user_id": str(self.user_id), + "memory_id": str(self.memory_id), + "ltm_memory_id": str(self.ltm_memory_id), + "st_memory_id": str(self.st_memory_id), + "buffer_id": str(self.buffer_id), + "version": params.get("version", None) or "", + "agreement_id": params.get("agreement_id", None) or "", + "privacy_policy": params.get("privacy_policy", None) or "", + "terms_of_service": params.get("terms_of_service", None) or "", + "format": params.get("format", None) or "", + "schema_version": params.get("schema_version", None) or "", + "checksum": params.get("checksum", None) or "", + "owner": params.get("owner", None) or "", + "license": params.get("license", None) or "", + "validity_start": params.get("validity_start", None) or "", + "validity_end": params.get("validity_end", None) or "" + # **source_metadata, + }, + page_content=observation, + ) + ] + + if loader_settings: + # Load the document + document = self._document_loader(observation, loader_settings) + print("DOC LENGTH", len(document)) + for doc in document: + document_to_load = _stuct(doc.page_content, params) + retriever.add_documents( + document_to_load + ) + + return retriever.add_documents( + _stuct(observation, params) + ) + + async def fetch_memories( + self, observation: str, namespace: str, params: dict = None, n_of_observations =int(2) + ): + """ + Get documents from weaviate. + + Parameters: + - observation (str): User query. + - namespace (str): Type of memory we access. 
+ - params (dict, optional): + - n_of_observations (int, optional): For weaviate, equals to autocut, defaults to 1. Ranges from 1 to 3. Check weaviate docs for more info. + + Returns: + Describe the return type and what the function returns. + + Args a json containing: + query (str): The query string. + path (list): The path for filtering, e.g., ['year']. + operator (str): The operator for filtering, e.g., 'Equal'. + valueText (str): The value for filtering, e.g., '2017*'. + + Example: + get_from_weaviate(query="some query", path=['year'], operator='Equal', valueText='2017*') + + """ + client = self.init_weaviate_client(self.namespace) + + print(self.namespace) + print(str(datetime.now())) + print(observation) + if namespace is None: + namespace = self.namespace + + params_user_id = { + "path": ["user_id"], + "operator": "Like", + "valueText": self.user_id, + } + + if params: + query_output = ( + client.query.get( + namespace, + [ + # "text", + "user_id", + "memory_id", + "ltm_memory_id", + "st_memory_id", + "buffer_id", + "version", + "agreement_id", + "privacy_policy", + "terms_of_service", + "format", + "schema_version", + "checksum", + "owner", + "license", + "validity_start", + "validity_end", + ], + ) + .with_where(params) + .with_near_text({"concepts": [observation]}) + .with_additional( + ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score",'distance'] + ) + .with_where(params_user_id) + .with_limit(10) + .do() + ) + return query_output + else: + query_output = ( + client.query.get( + namespace, + + [ + "text", + "user_id", + "memory_id", + "ltm_memory_id", + "st_memory_id", + "buffer_id", + "version", + "agreement_id", + "privacy_policy", + "terms_of_service", + "format", + "schema_version", + "checksum", + "owner", + "license", + "validity_start", + "validity_end", + ], + ) + .with_additional( + ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score", 'distance'] + ) + .with_hybrid( + query=observation, + fusion_type=HybridFusion.RELATIVE_SCORE + ) + .with_autocut(n_of_observations) + .with_where(params_user_id) + .with_limit(10) + .do() + ) + return query_output + + async def delete_memories(self, params: dict = None): + client = self.init_weaviate_client(self.namespace) + if params: + where_filter = { + "path": ["id"], + "operator": "Equal", + "valueText": params.get("id", None), + } + return client.batch.delete_objects( + class_name=self.namespace, + # Same `where` filter as in the GraphQL API + where=where_filter, + ) + else: + # Delete all objects + print("HERE IS THE USER ID", self.user_id) + return client.batch.delete_objects( + class_name=self.namespace, + where={ + "path": ["user_id"], + "operator": "Equal", + "valueText": self.user_id, + }, + ) + + def update_memories(self, observation, namespace: str, params: dict = None): + client = self.init_weaviate_client(self.namespace) + + client.data_object.update( + data_object={ + # "text": observation, + "user_id": str(self.user_id), + "memory_id": str(self.memory_id), + "ltm_memory_id": str(self.ltm_memory_id), + "st_memory_id": str(self.st_memory_id), + "buffer_id": str(self.buffer_id), + "version": params.get("version", None) or "", + "agreement_id": params.get("agreement_id", None) or "", + "privacy_policy": params.get("privacy_policy", None) or "", + "terms_of_service": params.get("terms_of_service", None) or "", + "format": params.get("format", None) or "", + "schema_version": params.get("schema_version", None) or "", + "checksum": params.get("checksum", None) or "", + "owner": params.get("owner", None) or "", + 
"license": params.get("license", None) or "", + "validity_start": params.get("validity_start", None) or "", + "validity_end": params.get("validity_end", None) or "" + # **source_metadata, + }, + class_name="Test", + uuid=params.get("id", None), + consistency_level=weaviate.data.replication.ConsistencyLevel.ALL, # default QUORUM + ) + return + + +class BaseMemory: + def __init__( + self, + user_id: str, + memory_id: Optional[str], + index_name: Optional[str], + db_type: str, + namespace: str, + ): + self.user_id = user_id + self.memory_id = memory_id + self.index_name = index_name + self.namespace = namespace + self.memory_type_id = str(uuid.uuid4()) + self.db_type = db_type + factory = VectorDBFactory() + self.vector_db = factory.create_vector_db( + self.user_id, + self.index_name, + self.memory_id, + db_type=self.db_type, + namespace=self.namespace, + ) + + def init_client(self, namespace: str): + if self.db_type == "weaviate": + return self.vector_db.init_weaviate_client(namespace) + + async def add_memories( + self, + observation: Optional[str] = None, + loader_settings: dict = None, + params: Optional[dict] = None, + namespace: Optional[str] = None, + ): + if self.db_type == "weaviate": + return await self.vector_db.add_memories( + observation=observation, loader_settings=loader_settings, + params=params, namespace=namespace + ) + # Add other db_type conditions if necessary + + async def fetch_memories( + self, + observation: str, + params: Optional[str] = None, + namespace: Optional[str] = None, + n_of_observations: Optional[int] = 2, + ): + if self.db_type == "weaviate": + return await self.vector_db.fetch_memories( + observation=observation, params=params, + namespace=namespace, + n_of_observations=n_of_observations + ) + + async def delete_memories(self, params: Optional[str] = None): + if self.db_type == "weaviate": + return await self.vector_db.delete_memories(params) + + # Additional methods for specific Memory can be added here From 6e01e9af79f100330ea4cc702c6bf6f0dab4e002 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Tue, 12 Sep 2023 16:09:18 +0200 Subject: [PATCH 5/5] Split the files and fixed issues with: 1. wrong uuid for st memmory 2. weaviate checker logic that was not needed 3. 
decomposed vector db and factory classes into separate files --- .../level_2_pdf_vectorstore__dlt_contracts.py | 6 +- level_2/vectordb/basevectordb.py | 348 +---------------- level_2/vectordb/vectordb.py | 355 ++++++++++++++++++ 3 files changed, 375 insertions(+), 334 deletions(-) create mode 100644 level_2/vectordb/vectordb.py diff --git a/level_2/level_2_pdf_vectorstore__dlt_contracts.py b/level_2/level_2_pdf_vectorstore__dlt_contracts.py index a5a3dd52d..9799d651f 100644 --- a/level_2/level_2_pdf_vectorstore__dlt_contracts.py +++ b/level_2/level_2_pdf_vectorstore__dlt_contracts.py @@ -74,7 +74,9 @@ marvin.settings.openai.api_key = os.environ.get("OPENAI_API_KEY") # Assuming OpenAIEmbeddings and other necessary imports are available -from vectordb.basevectordb import BaseMemory, PineconeVectorDB, WeaviateVectorDB +from vectordb.basevectordb import BaseMemory + + from modulators.modulators import DifferentiableLayer @@ -115,7 +117,7 @@ class EpisodicBuffer(BaseMemory): user_id, memory_id, index_name, db_type, namespace="BUFFERMEMORY" ) - self.st_memory_id = "blah" + self.st_memory_id = str( uuid.uuid4()) self.llm = ChatOpenAI( temperature=0.0, max_tokens=1200, diff --git a/level_2/vectordb/basevectordb.py b/level_2/vectordb/basevectordb.py index 664526281..3093286f7 100644 --- a/level_2/vectordb/basevectordb.py +++ b/level_2/vectordb/basevectordb.py @@ -2,6 +2,8 @@ import logging from io import BytesIO +from level_2.vectordb.vectordb import PineconeVectorDB, WeaviateVectorDB + logging.basicConfig(level=logging.INFO) import marvin import requests @@ -63,325 +65,8 @@ class VectorDBFactory: raise ValueError(f"Unsupported database type: {db_type}") -class VectorDB: - OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") - - def __init__( - self, - user_id: str, - index_name: str, - memory_id: str, - ltm_memory_id: str = LTM_MEMORY_ID_DEFAULT, - st_memory_id: str = ST_MEMORY_ID_DEFAULT, - buffer_id: str = BUFFER_ID_DEFAULT, - namespace: str = None, - ): - self.user_id = user_id - self.index_name = index_name - self.namespace = namespace - self.memory_id = memory_id - self.ltm_memory_id = ltm_memory_id - self.st_memory_id = st_memory_id - self.buffer_id = buffer_id - -class PineconeVectorDB(VectorDB): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.init_pinecone(self.index_name) - - def init_pinecone(self, index_name): - # Pinecone initialization logic - pass -class WeaviateVectorDB(VectorDB): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.init_weaviate(self.namespace) - - def init_weaviate(self, namespace: str): - # Weaviate initialization logic - embeddings = OpenAIEmbeddings() - auth_config = weaviate.auth.AuthApiKey( - api_key=os.environ.get("WEAVIATE_API_KEY") - ) - client = weaviate.Client( - url=os.environ.get("WEAVIATE_URL"), - auth_client_secret=auth_config, - additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, - ) - retriever = WeaviateHybridSearchRetriever( - client=client, - index_name=namespace, - text_key="text", - attributes=[], - embedding=embeddings, - create_schema_if_missing=True, - ) - return retriever # If this is part of the initialization, call it here. 
- - def init_weaviate_client(self, namespace: str): - # Weaviate client initialization logic - auth_config = weaviate.auth.AuthApiKey( - api_key=os.environ.get("WEAVIATE_API_KEY") - ) - client = weaviate.Client( - url=os.environ.get("WEAVIATE_URL"), - auth_client_secret=auth_config, - additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, - ) - return client - - def _document_loader(self, observation: str, loader_settings: dict): - # Create an in-memory file-like object for the PDF content - - if loader_settings.get("format") == "PDF": - - if loader_settings.get("source") == "url": - pdf_response = requests.get(loader_settings["path"]) - pdf_stream = BytesIO(pdf_response.content) - contents = pdf_stream.read() - tmp_location = os.path.join("/tmp", "tmp.pdf") - with open(tmp_location, "wb") as tmp_file: - tmp_file.write(contents) - - # Process the PDF using PyPDFLoader - loader = PyPDFLoader(tmp_location) - # adapt this for different chunking strategies - pages = loader.load_and_split() - return pages - - if loader_settings.get("source") == "file": - # Process the PDF using PyPDFLoader - # might need adapting for different loaders + OCR - # need to test the path - loader = PyPDFLoader(loader_settings["path"]) - pages = loader.load_and_split() - - return pages - else: - # Process the text by just loading the base text - return observation - - - async def add_memories( - self, observation: str, loader_settings: dict = None, params: dict = None ,namespace:str=None - ): - # Update Weaviate memories here - print(self.namespace) - if namespace is None: - namespace = self.namespace - retriever = self.init_weaviate(namespace) - - def _stuct(observation, params): - """Utility function to not repeat metadata structure""" - # needs smarter solution, like dynamic generation of metadata - return [ - Document( - metadata={ - # "text": observation, - "user_id": str(self.user_id), - "memory_id": str(self.memory_id), - "ltm_memory_id": str(self.ltm_memory_id), - "st_memory_id": str(self.st_memory_id), - "buffer_id": str(self.buffer_id), - "version": params.get("version", None) or "", - "agreement_id": params.get("agreement_id", None) or "", - "privacy_policy": params.get("privacy_policy", None) or "", - "terms_of_service": params.get("terms_of_service", None) or "", - "format": params.get("format", None) or "", - "schema_version": params.get("schema_version", None) or "", - "checksum": params.get("checksum", None) or "", - "owner": params.get("owner", None) or "", - "license": params.get("license", None) or "", - "validity_start": params.get("validity_start", None) or "", - "validity_end": params.get("validity_end", None) or "" - # **source_metadata, - }, - page_content=observation, - ) - ] - - if loader_settings: - # Load the document - document = self._document_loader(observation, loader_settings) - print("DOC LENGTH", len(document)) - for doc in document: - document_to_load = _stuct(doc.page_content, params) - retriever.add_documents( - document_to_load - ) - - return retriever.add_documents( - _stuct(observation, params) - ) - - async def fetch_memories( - self, observation: str, namespace: str, params: dict = None, n_of_observations =int(2) - ): - """ - Get documents from weaviate. - - Parameters: - - observation (str): User query. - - namespace (str): Type of memory we access. - - params (dict, optional): - - n_of_observations (int, optional): For weaviate, equals to autocut, defaults to 1. Ranges from 1 to 3. Check weaviate docs for more info. 
- - Returns: - Describe the return type and what the function returns. - - Args a json containing: - query (str): The query string. - path (list): The path for filtering, e.g., ['year']. - operator (str): The operator for filtering, e.g., 'Equal'. - valueText (str): The value for filtering, e.g., '2017*'. - - Example: - get_from_weaviate(query="some query", path=['year'], operator='Equal', valueText='2017*') - - """ - client = self.init_weaviate_client(self.namespace) - - print(self.namespace) - print(str(datetime.now())) - print(observation) - if namespace is None: - namespace = self.namespace - - params_user_id = { - "path": ["user_id"], - "operator": "Like", - "valueText": self.user_id, - } - - if params: - query_output = ( - client.query.get( - namespace, - [ - # "text", - "user_id", - "memory_id", - "ltm_memory_id", - "st_memory_id", - "buffer_id", - "version", - "agreement_id", - "privacy_policy", - "terms_of_service", - "format", - "schema_version", - "checksum", - "owner", - "license", - "validity_start", - "validity_end", - ], - ) - .with_where(params) - .with_near_text({"concepts": [observation]}) - .with_additional( - ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score",'distance'] - ) - .with_where(params_user_id) - .with_limit(10) - .do() - ) - return query_output - else: - query_output = ( - client.query.get( - namespace, - - [ - "text", - "user_id", - "memory_id", - "ltm_memory_id", - "st_memory_id", - "buffer_id", - "version", - "agreement_id", - "privacy_policy", - "terms_of_service", - "format", - "schema_version", - "checksum", - "owner", - "license", - "validity_start", - "validity_end", - ], - ) - .with_additional( - ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score", 'distance'] - ) - .with_hybrid( - query=observation, - fusion_type=HybridFusion.RELATIVE_SCORE - ) - .with_autocut(n_of_observations) - .with_where(params_user_id) - .with_limit(10) - .do() - ) - return query_output - - async def delete_memories(self, params: dict = None): - client = self.init_weaviate_client(self.namespace) - if params: - where_filter = { - "path": ["id"], - "operator": "Equal", - "valueText": params.get("id", None), - } - return client.batch.delete_objects( - class_name=self.namespace, - # Same `where` filter as in the GraphQL API - where=where_filter, - ) - else: - # Delete all objects - print("HERE IS THE USER ID", self.user_id) - return client.batch.delete_objects( - class_name=self.namespace, - where={ - "path": ["user_id"], - "operator": "Equal", - "valueText": self.user_id, - }, - ) - - def update_memories(self, observation, namespace: str, params: dict = None): - client = self.init_weaviate_client(self.namespace) - - client.data_object.update( - data_object={ - # "text": observation, - "user_id": str(self.user_id), - "memory_id": str(self.memory_id), - "ltm_memory_id": str(self.ltm_memory_id), - "st_memory_id": str(self.st_memory_id), - "buffer_id": str(self.buffer_id), - "version": params.get("version", None) or "", - "agreement_id": params.get("agreement_id", None) or "", - "privacy_policy": params.get("privacy_policy", None) or "", - "terms_of_service": params.get("terms_of_service", None) or "", - "format": params.get("format", None) or "", - "schema_version": params.get("schema_version", None) or "", - "checksum": params.get("checksum", None) or "", - "owner": params.get("owner", None) or "", - "license": params.get("license", None) or "", - "validity_start": params.get("validity_start", None) or "", - "validity_end": params.get("validity_end", None) or "" - # 
**source_metadata, - }, - class_name="Test", - uuid=params.get("id", None), - consistency_level=weaviate.data.replication.ConsistencyLevel.ALL, # default QUORUM - ) - return class BaseMemory: @@ -409,8 +94,8 @@ class BaseMemory: ) def init_client(self, namespace: str): - if self.db_type == "weaviate": - return self.vector_db.init_weaviate_client(namespace) + + return self.vector_db.init_weaviate_client(namespace) async def add_memories( self, @@ -419,11 +104,11 @@ class BaseMemory: params: Optional[dict] = None, namespace: Optional[str] = None, ): - if self.db_type == "weaviate": - return await self.vector_db.add_memories( - observation=observation, loader_settings=loader_settings, - params=params, namespace=namespace - ) + + return await self.vector_db.add_memories( + observation=observation, loader_settings=loader_settings, + params=params, namespace=namespace + ) # Add other db_type conditions if necessary async def fetch_memories( @@ -433,15 +118,14 @@ class BaseMemory: namespace: Optional[str] = None, n_of_observations: Optional[int] = 2, ): - if self.db_type == "weaviate": - return await self.vector_db.fetch_memories( - observation=observation, params=params, - namespace=namespace, - n_of_observations=n_of_observations - ) + + return await self.vector_db.fetch_memories( + observation=observation, params=params, + namespace=namespace, + n_of_observations=n_of_observations + ) async def delete_memories(self, params: Optional[str] = None): - if self.db_type == "weaviate": - return await self.vector_db.delete_memories(params) + return await self.vector_db.delete_memories(params) # Additional methods for specific Memory can be added here diff --git a/level_2/vectordb/vectordb.py b/level_2/vectordb/vectordb.py new file mode 100644 index 000000000..4fa4f0308 --- /dev/null +++ b/level_2/vectordb/vectordb.py @@ -0,0 +1,355 @@ + +# Make sure to install the following packages: dlt, langchain, duckdb, python-dotenv, openai, weaviate-client +import logging +from io import BytesIO + + + +logging.basicConfig(level=logging.INFO) +import marvin +import requests +from dotenv import load_dotenv +from langchain.document_loaders import PyPDFLoader +from langchain.retrievers import WeaviateHybridSearchRetriever +from weaviate.gql.get import HybridFusion + +load_dotenv() +from typing import Optional + +import tracemalloc + +tracemalloc.start() + +import os +from datetime import datetime +from langchain.embeddings.openai import OpenAIEmbeddings +from dotenv import load_dotenv +from langchain.schema import Document +import uuid +import weaviate + +load_dotenv() + + +LTM_MEMORY_ID_DEFAULT = "00000" +ST_MEMORY_ID_DEFAULT = "0000" +BUFFER_ID_DEFAULT = "0000" +class VectorDB: + OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") + + def __init__( + self, + user_id: str, + index_name: str, + memory_id: str, + ltm_memory_id: str = LTM_MEMORY_ID_DEFAULT, + st_memory_id: str = ST_MEMORY_ID_DEFAULT, + buffer_id: str = BUFFER_ID_DEFAULT, + namespace: str = None, + ): + self.user_id = user_id + self.index_name = index_name + self.namespace = namespace + self.memory_id = memory_id + self.ltm_memory_id = ltm_memory_id + self.st_memory_id = st_memory_id + self.buffer_id = buffer_id + +class PineconeVectorDB(VectorDB): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.init_pinecone(self.index_name) + + def init_pinecone(self, index_name): + # Pinecone initialization logic + pass + + +class WeaviateVectorDB(VectorDB): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + 
self.init_weaviate(self.namespace) + + def init_weaviate(self, namespace: str): + # Weaviate initialization logic + embeddings = OpenAIEmbeddings() + auth_config = weaviate.auth.AuthApiKey( + api_key=os.environ.get("WEAVIATE_API_KEY") + ) + client = weaviate.Client( + url=os.environ.get("WEAVIATE_URL"), + auth_client_secret=auth_config, + additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, + ) + retriever = WeaviateHybridSearchRetriever( + client=client, + index_name=namespace, + text_key="text", + attributes=[], + embedding=embeddings, + create_schema_if_missing=True, + ) + return retriever # If this is part of the initialization, call it here. + + def init_weaviate_client(self, namespace: str): + # Weaviate client initialization logic + auth_config = weaviate.auth.AuthApiKey( + api_key=os.environ.get("WEAVIATE_API_KEY") + ) + client = weaviate.Client( + url=os.environ.get("WEAVIATE_URL"), + auth_client_secret=auth_config, + additional_headers={"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}, + ) + return client + + def _document_loader(self, observation: str, loader_settings: dict): + # Create an in-memory file-like object for the PDF content + + if loader_settings.get("format") == "PDF": + + if loader_settings.get("source") == "url": + pdf_response = requests.get(loader_settings["path"]) + pdf_stream = BytesIO(pdf_response.content) + contents = pdf_stream.read() + tmp_location = os.path.join("/tmp", "tmp.pdf") + with open(tmp_location, "wb") as tmp_file: + tmp_file.write(contents) + + # Process the PDF using PyPDFLoader + loader = PyPDFLoader(tmp_location) + # adapt this for different chunking strategies + pages = loader.load_and_split() + return pages + + if loader_settings.get("source") == "file": + # Process the PDF using PyPDFLoader + # might need adapting for different loaders + OCR + # need to test the path + loader = PyPDFLoader(loader_settings["path"]) + pages = loader.load_and_split() + + return pages + else: + # Process the text by just loading the base text + return observation + + + async def add_memories( + self, observation: str, loader_settings: dict = None, params: dict = None ,namespace:str=None + ): + # Update Weaviate memories here + print(self.namespace) + if namespace is None: + namespace = self.namespace + retriever = self.init_weaviate(namespace) + + def _stuct(observation, params): + """Utility function to not repeat metadata structure""" + # needs smarter solution, like dynamic generation of metadata + return [ + Document( + metadata={ + # "text": observation, + "user_id": str(self.user_id), + "memory_id": str(self.memory_id), + "ltm_memory_id": str(self.ltm_memory_id), + "st_memory_id": str(self.st_memory_id), + "buffer_id": str(self.buffer_id), + "version": params.get("version", None) or "", + "agreement_id": params.get("agreement_id", None) or "", + "privacy_policy": params.get("privacy_policy", None) or "", + "terms_of_service": params.get("terms_of_service", None) or "", + "format": params.get("format", None) or "", + "schema_version": params.get("schema_version", None) or "", + "checksum": params.get("checksum", None) or "", + "owner": params.get("owner", None) or "", + "license": params.get("license", None) or "", + "validity_start": params.get("validity_start", None) or "", + "validity_end": params.get("validity_end", None) or "" + # **source_metadata, + }, + page_content=observation, + ) + ] + + if loader_settings: + # Load the document + document = self._document_loader(observation, loader_settings) + print("DOC LENGTH", 
len(document))
+            for doc in document:
+                document_to_load = _stuct(doc.page_content, params)
+                retriever.add_documents(
+                    document_to_load
+                )
+
+        return retriever.add_documents(
+            _stuct(observation, params)
+        )
+
+    async def fetch_memories(
+        self, observation: str, namespace: str, params: dict = None, n_of_observations: int = 2
+    ):
+        """
+        Get documents from Weaviate.
+
+        Parameters:
+        - observation (str): User query.
+        - namespace (str): Type of memory we access.
+        - params (dict, optional): A Weaviate where-filter, e.g.
+          {"path": ["year"], "operator": "Equal", "valueText": "2017*"}.
+        - n_of_observations (int, optional): For Weaviate, equals autocut; defaults to 2.
+          Ranges from 1 to 3. Check the Weaviate docs for more info.
+
+        Returns:
+        The raw Weaviate query result as a dict.
+
+        Example:
+            fetch_memories(observation="some query", params={"path": ["year"], "operator": "Equal", "valueText": "2017*"})
+        """
+        client = self.init_weaviate_client(self.namespace)
+
+        print(self.namespace)
+        print(str(datetime.now()))
+        print(observation)
+        if namespace is None:
+            namespace = self.namespace
+
+        params_user_id = {
+            "path": ["user_id"],
+            "operator": "Like",
+            "valueText": self.user_id,
+        }
+
+        if params:
+            query_output = (
+                client.query.get(
+                    namespace,
+                    [
+                        # "text",
+                        "user_id",
+                        "memory_id",
+                        "ltm_memory_id",
+                        "st_memory_id",
+                        "buffer_id",
+                        "version",
+                        "agreement_id",
+                        "privacy_policy",
+                        "terms_of_service",
+                        "format",
+                        "schema_version",
+                        "checksum",
+                        "owner",
+                        "license",
+                        "validity_start",
+                        "validity_end",
+                    ],
+                )
+                # Combine the caller's filter with the user scope; chaining
+                # .with_where() twice would overwrite the first filter.
+                .with_where(
+                    {"operator": "And", "operands": [params, params_user_id]}
+                )
+                .with_near_text({"concepts": [observation]})
+                .with_additional(
+                    ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score", "distance"]
+                )
+                .with_limit(10)
+                .do()
+            )
+            return query_output
+        else:
+            query_output = (
+                client.query.get(
+                    namespace,
+                    [
+                        "text",
+                        "user_id",
+                        "memory_id",
+                        "ltm_memory_id",
+                        "st_memory_id",
+                        "buffer_id",
+                        "version",
+                        "agreement_id",
+                        "privacy_policy",
+                        "terms_of_service",
+                        "format",
+                        "schema_version",
+                        "checksum",
+                        "owner",
+                        "license",
+                        "validity_start",
+                        "validity_end",
+                    ],
+                )
+                .with_additional(
+                    ["id", "creationTimeUnix", "lastUpdateTimeUnix", "score", "distance"]
+                )
+                .with_hybrid(
+                    query=observation,
+                    fusion_type=HybridFusion.RELATIVE_SCORE
+                )
+                .with_autocut(n_of_observations)
+                .with_where(params_user_id)
+                .with_limit(10)
+                .do()
+            )
+            return query_output
+
+    async def delete_memories(self, params: dict = None):
+        client = self.init_weaviate_client(self.namespace)
+        if params:
+            where_filter = {
+                "path": ["id"],
+                "operator": "Equal",
+                "valueText": params.get("id", None),
+            }
+            return client.batch.delete_objects(
+                class_name=self.namespace,
+                # Same `where` filter as in the GraphQL API
+                where=where_filter,
+            )
+        else:
+            # Delete all of this user's objects
+            print("HERE IS THE USER ID", self.user_id)
+            return client.batch.delete_objects(
+                class_name=self.namespace,
+                where={
+                    "path": ["user_id"],
+                    "operator": "Equal",
+                    "valueText": self.user_id,
+                },
+            )
+
+    def update_memories(self, observation, namespace: str, params: dict = None):
+        client = self.init_weaviate_client(self.namespace)
+
+        client.data_object.update(
+            data_object={
+                # "text": observation,
+                "user_id": str(self.user_id),
+                "memory_id": str(self.memory_id),
+                "ltm_memory_id": str(self.ltm_memory_id),
+                "st_memory_id": str(self.st_memory_id),
+                "buffer_id": str(self.buffer_id),
+                "version": params.get("version", None) or "",
+                "agreement_id": params.get("agreement_id", None) or "",
+                "privacy_policy": params.get("privacy_policy", None) or "",
+                "terms_of_service": params.get("terms_of_service", None) or "",
+                "format": params.get("format", None) or "",
+                "schema_version": params.get("schema_version", None) or "",
+                "checksum": params.get("checksum", None) or "",
+                "owner": params.get("owner", None) or "",
+                "license": params.get("license", None) or "",
+                "validity_start": params.get("validity_start", None) or "",
+                "validity_end": params.get("validity_end", None) or ""
+                # **source_metadata,
+            },
+            # Write to the caller-supplied namespace instead of the
+            # hardcoded "Test" class.
+            class_name=namespace,
+            uuid=params.get("id", None),
+            consistency_level=weaviate.data.replication.ConsistencyLevel.ALL,  # default QUORUM
+        )
+        return