added level 2 initial code
This commit is contained in:
parent
e8fc6c8952
commit
81b6cc1920
15 changed files with 5063 additions and 0 deletions
11
level_2/.env
Normal file
11
level_2/.env
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
# SECURITY(review): real-looking credentials are committed in this .env — revoke and rotate every key below, and remove .env from version control (keep only .env.template).
OPENAI_API_KEY = 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
|
||||||
|
PINECONE_API_KEY = 4e0982ad-57d5-41ac-bce5-d1fd2c2da273
|
||||||
|
PINECONE_API_ENV = us-west1-gcp
|
||||||
|
REPLICATE_API_TOKEN = 4e0982ad-57d5-41ac-bce5-d1fd2c2da273
|
||||||
|
GPLACES_API_KEY = AIzaSyAfuT9tBy6wC3phZR1Tl5acknNA_TU2mKE
|
||||||
|
REDIS_HOST=redis
|
||||||
|
SERPAPI_API_KEY=17bb94b76b0d7cf3fb1c36d8376e0fc4c3ed761e862b05ef154e116d73c39da5
|
||||||
|
ZAPIER_NLA_API_KEY=sk-ak-GtXls7Y5JcPOSbWw7SZDzSvtAF
|
||||||
|
LOCAL_DEV = True
|
||||||
|
WEAVIATE_API_KEY =shCCL5EVpOKxIdZhMRH090lFqDb5aE1XgUTP
|
||||||
|
WEAVIATE_URL = https://new-test-cluster-i49dzudl.weaviate.network
|
||||||
3
level_2/.env.template
Normal file
3
level_2/.env.template
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
OPENAI_API_KEY=sk
|
||||||
|
WEAVIATE_URL =
|
||||||
|
WEAVIATE_API_KEY =
|
||||||
36
level_2/Dockerfile
Normal file
36
level_2/Dockerfile
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
|
||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
# Set build argument
|
||||||
|
ARG API_ENABLED
|
||||||
|
|
||||||
|
# Set environment variable based on the build argument
|
||||||
|
ENV API_ENABLED=${API_ENABLED} \
|
||||||
|
PIP_NO_CACHE_DIR=true
|
||||||
|
ENV PATH="${PATH}:/root/.poetry/bin"
|
||||||
|
RUN pip install poetry
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
COPY pyproject.toml poetry.lock /app/
|
||||||
|
|
||||||
|
# Install the dependencies
|
||||||
|
RUN poetry config virtualenvs.create false && \
|
||||||
|
poetry install --no-root --no-dev
|
||||||
|
|
||||||
|
RUN apt-get update -q && \
|
||||||
|
apt-get install curl zip jq netcat-traditional -y -q
|
||||||
|
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
|
||||||
|
unzip -qq awscliv2.zip && ./aws/install && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
|
|
||||||
|
#RUN playwright install
|
||||||
|
#RUN playwright install-deps
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
COPY . /app
|
||||||
|
COPY entrypoint.sh /app/entrypoint.sh
|
||||||
|
RUN chmod +x /app/entrypoint.sh
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/entrypoint.sh"]
|
||||||
44
level_2/Readme.md
Normal file
44
level_2/Readme.md
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
## PromethAI Memory Manager
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Description
|
||||||
|
|
||||||
|
|
||||||
|
Initial code lets you do three operations:
|
||||||
|
|
||||||
|
1. Add to memory
|
||||||
|
2. Retrieve from memory
|
||||||
|
3. Structure the data to schema and load to duckdb
|
||||||
|
|
||||||
|
## How to use
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```docker compose build promethai_mem ```
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
```docker compose up promethai_mem ```
|
||||||
|
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
The fast API endpoint accepts prompts and PDF files and returns a JSON object with the generated text.
|
||||||
|
|
||||||
|
```curl
|
||||||
|
-X POST
|
||||||
|
-F "prompt=The quick brown fox"
|
||||||
|
-F "file=@/path/to/file.pdf"
|
||||||
|
http://localhost:8000/upload/
|
||||||
|
```
|
||||||
|
|
||||||
|
{
|
||||||
|
"payload": {
|
||||||
|
"user_id": "681",
|
||||||
|
"session_id": "471",
|
||||||
|
"model_speed": "slow",
|
||||||
|
"prompt": "Temperature=Cold;Food Type=Ice Cream",
|
||||||
|
"pdf_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
|
||||||
|
}
|
||||||
|
}
|
||||||
230
level_2/api.py
Normal file
230
level_2/api.py
Normal file
|
|
@ -0,0 +1,230 @@
|
||||||
|
from langchain.document_loaders import PyPDFLoader
|
||||||
|
|
||||||
|
from level_2_pdf_vectorstore__dlt_contracts import Memory
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Dict, Any
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import Request
|
||||||
|
import yaml
|
||||||
|
from fastapi import HTTPException
|
||||||
|
from fastapi import FastAPI, UploadFile, File
|
||||||
|
from typing import List
|
||||||
|
import requests
|
||||||
|
# Set up logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO, # Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||||
|
format="%(asctime)s [%(levelname)s] %(message)s", # Set the log message format
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(debug=True)
|
||||||
|
|
||||||
|
|
||||||
|
from fastapi import Depends
|
||||||
|
|
||||||
|
|
||||||
|
class ImageResponse(BaseModel):
    """Simple success/message response envelope."""

    success: bool
    message: str
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
async def root():
    """Root endpoint that returns a welcome message."""
    greeting = {"message": "Hello, World, I am alive!"}
    return greeting
|
||||||
|
|
||||||
|
@app.get("/health")
def health_check():
    """Health check endpoint that returns the server status."""
    status_payload = {"status": "OK"}
    return status_payload
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#curl -X POST -H "Content-Type: application/json" -d '{"data": "YourPayload"}' -F "files=@/path/to/your/pdf/file.pdf" http://127.0.0.1:8000/upload/

# These re-imports duplicate the ones at the top of the file; kept (harmless)
# pending a wider cleanup.
from fastapi import FastAPI, UploadFile, File
import requests
import os
import json

# FIX(review): a second `app = FastAPI()` previously re-bound the module-level
# app here. That detached the "/" and "/health" routes registered above (they
# lived on the first instance, while gunicorn serves `api:app`, the rebound
# one) and silently dropped debug mode. Keep the single FastAPI instance
# created at the top of the file.

from io import BytesIO
|
||||||
|
|
||||||
|
|
||||||
|
class Payload(BaseModel):
    """Request body wrapper: an arbitrary JSON object under the `payload` key."""

    payload: Dict[str, Any]
|
||||||
|
|
||||||
|
@app.post("/upload/", response_model=dict)
async def upload_pdf_and_payload(
    payload: Payload,
    # files: List[UploadFile] = File(...),
):
    """Accept a JSON payload, download and split the referenced PDF, and run
    the memory buffer over it.

    Expects `payload.payload` to contain a `pdf_url` key pointing at a remote
    PDF.  Other keys (user_id, session_id, prompt, model_speed, ...) are sent
    by callers but not read here yet — see TODOs below.

    Returns:
        {"response": ...} on success, or {"error": ...} on failure (error dict
        in a 200 body is kept to preserve the existing client contract).
    """
    try:
        decoded_payload = payload.payload

        # FIX(review): a missing 'pdf_url' previously fell through to the
        # return statement below with `response` unbound, surfacing as a
        # confusing NameError string.  Fail fast with a clear message instead.
        if 'pdf_url' not in decoded_payload:
            return JSONResponse(
                content={"error": "payload must include a 'pdf_url' key"},
                status_code=400,
            )

        # Download the remote PDF (bounded so a dead host can't hang a worker).
        pdf_response = requests.get(decoded_payload['pdf_url'], timeout=60)
        pdf_content = pdf_response.content
        logging.info("Downloaded PDF from URL")

        # Create an in-memory file-like object for the PDF content, then
        # persist it so PyPDFLoader can read from a path.
        pdf_stream = BytesIO(pdf_content)
        contents = pdf_stream.read()

        tmp_location = os.path.join('/tmp', "tmp.pdf")
        with open(tmp_location, 'wb') as tmp_file:
            tmp_file.write(contents)
        logging.info("Wrote PDF from URL")

        # Process the PDF using PyPDFLoader.
        loader = PyPDFLoader(tmp_location)
        pages = loader.load_and_split()
        logging.info(" PDF split into pages")

        Memory_ = Memory(index_name="my-agent", user_id='555')
        await Memory_.async_init()

        # First pass feeds the page content into the buffer ...
        Memory_._run_buffer(user_input="I want to get a schema for my data", content=pages)
        # ... second pass queries it; only this result is returned.
        # NOTE(review): async_init is awaited but _run_buffer is not — if
        # _run_buffer is also a coroutine these calls silently do nothing;
        # confirm against the Memory implementation.
        response = Memory_._run_buffer(user_input="I want to get a schema for my data")
        return JSONResponse(content={"response": response}, status_code=200)

        # TODO: add the user id to the payload
        # TODO: add the raw pdf to the payload
    except Exception as e:
        return {"error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
# @app.post("/clear-cache", response_model=dict)
|
||||||
|
# async def clear_cache(request_data: Payload) -> dict:
|
||||||
|
# """
|
||||||
|
# Endpoint to clear the cache.
|
||||||
|
#
|
||||||
|
# Parameters:
|
||||||
|
# request_data (Payload): The request data containing the user and session IDs.
|
||||||
|
#
|
||||||
|
# Returns:
|
||||||
|
# dict: A dictionary with a message indicating the cache was cleared.
|
||||||
|
# """
|
||||||
|
# json_payload = request_data.payload
|
||||||
|
# agent = Agent()
|
||||||
|
# agent.set_user_session(json_payload["user_id"], json_payload["session_id"])
|
||||||
|
# try:
|
||||||
|
# agent.clear_cache()
|
||||||
|
# return JSONResponse(content={"response": "Cache cleared"}, status_code=200)
|
||||||
|
# except Exception as e:
|
||||||
|
# raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
#
|
||||||
|
# @app.post("/correct-prompt-grammar", response_model=dict)
|
||||||
|
# async def prompt_to_correct_grammar(request_data: Payload) -> dict:
|
||||||
|
# json_payload = request_data.payload
|
||||||
|
# agent = Agent()
|
||||||
|
# agent.set_user_session(json_payload["user_id"], json_payload["session_id"])
|
||||||
|
# logging.info("Correcting grammar %s", json_payload["prompt_source"])
|
||||||
|
#
|
||||||
|
# output = agent.prompt_correction(json_payload["prompt_source"], model_speed= json_payload["model_speed"])
|
||||||
|
# return JSONResponse(content={"response": {"result": json.loads(output)}})
|
||||||
|
|
||||||
|
|
||||||
|
# @app.post("/action-add-zapier-calendar-action", response_model=dict,dependencies=[Depends(auth)])
|
||||||
|
# async def action_add_zapier_calendar_action(
|
||||||
|
# request: Request, request_data: Payload
|
||||||
|
# ) -> dict:
|
||||||
|
# json_payload = request_data.payload
|
||||||
|
# agent = Agent()
|
||||||
|
# agent.set_user_session(json_payload["user_id"], json_payload["session_id"])
|
||||||
|
# # Extract the bearer token from the header
|
||||||
|
# auth_header = request.headers.get("Authorization")
|
||||||
|
# if auth_header:
|
||||||
|
# bearer_token = auth_header.replace("Bearer ", "")
|
||||||
|
# else:
|
||||||
|
# bearer_token = None
|
||||||
|
# outcome = agent.add_zapier_calendar_action(
|
||||||
|
# prompt_base=json_payload["prompt_base"],
|
||||||
|
# token=bearer_token,
|
||||||
|
# model_speed=json_payload["model_speed"],
|
||||||
|
# )
|
||||||
|
# return JSONResponse(content={"response": outcome})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_server(host: str = "0.0.0.0", port: int = 8000):
    """Run the FastAPI app under uvicorn, logging any startup failure.

    Parameters:
        host (str): Interface to bind.
        port (int): TCP port to bind.
    """
    try:
        logger.info(f"Starting server at {host}:{port}")
        uvicorn.run(app, host=host, port=port)
    except Exception as exc:
        # Swallowed on purpose: log the failure and return control to the
        # caller, which may attempt cleanup or recovery.
        logger.exception(f"Failed to start server: {exc}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
start_api_server()
|
||||||
22
level_2/docker-compose.yml
Normal file
22
level_2/docker-compose.yml
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
version: "3.9"
|
||||||
|
|
||||||
|
services:
|
||||||
|
promethai_mem:
|
||||||
|
networks:
|
||||||
|
- promethai_mem_backend
|
||||||
|
build:
|
||||||
|
context: ./
|
||||||
|
volumes:
|
||||||
|
- "./:/app"
|
||||||
|
environment:
|
||||||
|
- HOST=0.0.0.0
|
||||||
|
profiles: ["exclude-from-up"] # Use `docker-compose run teenage-agi` to get an attached container
|
||||||
|
ports:
|
||||||
|
- 8000:8000
|
||||||
|
- 443:443
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
networks:
|
||||||
|
promethai_mem_backend:
|
||||||
|
name: promethai_mem_backend
|
||||||
6
level_2/entrypoint.sh
Executable file
6
level_2/entrypoint.sh
Executable file
|
|
@ -0,0 +1,6 @@
|
||||||
|
#!/bin/bash
|
||||||
|
export ENVIRONMENT
|
||||||
|
#python fetch_secret.py
|
||||||
|
|
||||||
|
# Start Gunicorn
|
||||||
|
gunicorn -w 2 -k uvicorn.workers.UvicornWorker -t 120 --bind=0.0.0.0:8000 --bind=0.0.0.0:443 --log-level debug api:app
|
||||||
711
level_2/level_2_pdf_vectorstore__dlt_contracts.py
Normal file
711
level_2/level_2_pdf_vectorstore__dlt_contracts.py
Normal file
|
|
@ -0,0 +1,711 @@
|
||||||
|
#Make sure to install the following packages: dlt, langchain, duckdb, python-dotenv, openai, weaviate-client
|
||||||
|
|
||||||
|
import dlt
|
||||||
|
from langchain import PromptTemplate, LLMChain
|
||||||
|
from langchain.agents import initialize_agent, AgentType
|
||||||
|
from langchain.chains.openai_functions import create_structured_output_chain
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.document_loaders import PyPDFLoader
|
||||||
|
import weaviate
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
from deep_translator import (GoogleTranslator)
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.schema import LLMResult, HumanMessage
|
||||||
|
from langchain.callbacks.base import AsyncCallbackHandler, BaseCallbackHandler
|
||||||
|
|
||||||
|
from langchain.memory import VectorStoreRetrieverMemory
|
||||||
|
from marvin import ai_classifier
|
||||||
|
from enum import Enum
|
||||||
|
import marvin
|
||||||
|
import asyncio
|
||||||
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
|
||||||
|
from langchain.retrievers import WeaviateHybridSearchRetriever
|
||||||
|
from langchain.schema import Document, SystemMessage, HumanMessage, LLMResult
|
||||||
|
from langchain.tools import tool
|
||||||
|
from langchain.vectorstores import Weaviate
|
||||||
|
import uuid
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
from pathlib import Path
|
||||||
|
from langchain import OpenAI, LLMMathChain
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.prompts import ChatPromptTemplate
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from jinja2 import Template
|
||||||
|
from langchain import PromptTemplate, LLMChain
|
||||||
|
from langchain.chains.openai_functions import create_structured_output_chain
|
||||||
|
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
|
||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
import pinecone
|
||||||
|
from langchain.vectorstores import Pinecone
|
||||||
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from langchain.schema import Document, SystemMessage, HumanMessage
|
||||||
|
from langchain.vectorstores import Weaviate
|
||||||
|
import weaviate
|
||||||
|
import uuid
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class MyCustomSyncHandler(BaseCallbackHandler):
    """Synchronous LangChain callback that echoes each streamed token."""

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        # Invoked once per token while the LLM streams its answer.
        print(f"Sync handler being called in a `thread_pool_executor`: token: {token}")
|
||||||
|
class MyCustomAsyncHandler(AsyncCallbackHandler):
    """Async callback handler that can be used to handle callbacks from langchain."""

    async def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Run when chain starts running."""
        print("zzzz....")
        await asyncio.sleep(0.3)
        # FIX(review): dropped the unused `class_name = serialized["name"]`
        # lookup — the value was never read and the bare indexing raised
        # KeyError whenever "name" was absent from the serialized payload.
        print("Hi! I just woke up. Your llm is starting")

    async def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when chain ends running."""
        print("zzzz....")
        await asyncio.sleep(0.3)
        print("Hi! I just woke up. Your llm is ending")
|
||||||
|
|
||||||
|
class VectorDB:
    """Thin wrapper over a vector store (Weaviate today; Pinecone scaffolding
    kept for later) that tags stored documents with the caller's memory
    hierarchy ids (memory / ltm / stm / buffer).
    """

    OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", 0.0))
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

    def __init__(self, user_id: str, index_name: str, memory_id: str, ltm_memory_id: str = '00000', st_memory_id: str = '0000', buffer_id: str = '0000', db_type: str = "pinecone", namespace: str = None):
        self.user_id = user_id
        self.index_name = index_name
        self.db_type = db_type
        self.namespace = namespace
        self.memory_id = memory_id
        self.ltm_memory_id = ltm_memory_id
        self.st_memory_id = st_memory_id
        self.buffer_id = buffer_id
        # NOTE(review): despite db_type defaulting to "pinecone", only
        # "weaviate" is actually wired up — any other value raises here.
        # The pinecone branch is preserved as scaffolding:
        # if self.db_type == "pinecone":
        #     self.vectorstore = self.init_pinecone(self.index_name)
        if self.db_type == "weaviate":
            # Return values are deliberately discarded: these calls act as an
            # early connectivity / credentials check; retrievers and clients
            # are rebuilt per operation below.
            self.init_weaviate(namespace=self.namespace)
            self.init_weaviate_client(namespace=self.namespace)
        else:
            raise ValueError(f"Unsupported database type: {db_type}")
        load_dotenv()

    def init_pinecone(self, index_name):
        """Connect to an existing Pinecone index and return a vectorstore over it."""
        load_dotenv()
        PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "")
        PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "")
        pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
        pinecone.Index(index_name)
        vectorstore: Pinecone = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=OpenAIEmbeddings(),
            namespace='RESULT'
        )
        return vectorstore

    def init_weaviate_client(self, namespace: str):
        """Build and return a raw Weaviate client from WEAVIATE_URL /
        WEAVIATE_API_KEY env vars (the OpenAI key is forwarded for
        server-side vectorization)."""
        auth_config = weaviate.auth.AuthApiKey(api_key=os.environ.get('WEAVIATE_API_KEY'))
        client = weaviate.Client(
            url=os.environ.get('WEAVIATE_URL'),
            auth_client_secret=auth_config,
            additional_headers={
                "X-OpenAI-Api-Key": os.environ.get('OPENAI_API_KEY')
            }
        )
        return client

    def init_weaviate(self, namespace: str):
        """Return a hybrid-search retriever over the given Weaviate class."""
        embeddings = OpenAIEmbeddings()
        # Reuse the shared client factory instead of duplicating its setup.
        client = self.init_weaviate_client(namespace=namespace)
        retriever = WeaviateHybridSearchRetriever(
            client=client,
            index_name=namespace,
            text_key="text",
            attributes=[],
            embedding=embeddings,
            create_schema_if_missing=True,
        )
        return retriever

    async def add_memories(self, observation: str, page: str = "", source: str = ""):
        """Store `observation` (plus provenance metadata) in the vector store.

        NOTE(review): declared async but the underlying store calls are
        blocking — the coroutine does synchronous work when awaited.
        """
        if self.db_type == "pinecone":
            vectorstore: Pinecone = Pinecone.from_existing_index(
                index_name=self.index_name, embedding=OpenAIEmbeddings(), namespace=self.namespace
            )
            retriever = vectorstore.as_retriever()
            retriever.add_documents(
                [
                    Document(
                        page_content=observation,
                        metadata={
                            "inserted_at": datetime.now(),
                            "text": observation,
                            "user_id": self.user_id,
                            "page": page,
                            "source": source
                        },
                        namespace=self.namespace,
                    )
                ]
            )
        elif self.db_type == "weaviate":
            print(self.namespace)
            retriever = self.init_weaviate(self.namespace)
            # Tag the document with the full memory-id hierarchy so it can be
            # filtered later per user / memory / buffer.
            return retriever.add_documents([
                Document(
                    metadata={
                        "inserted_at": str(datetime.now()),
                        "text": observation,
                        "user_id": str(self.user_id),
                        "memory_id": str(self.memory_id),
                        "ltm_memory_id": str(self.ltm_memory_id),
                        "st_memory_id": str(self.st_memory_id),
                        "buffer_id": str(self.buffer_id),
                        "last_accessed_at": str(datetime.now()),
                        # **source_metadata,
                    },
                    page_content=observation,
                )]
            )

    async def fetch_memories(self, observation: str, params=None):
        """Fetch documents relevant to `observation` from the vector store.

        Args a json containing:
            query (str): The query string.
            path (list): The path for filtering, e.g., ['year'].
            operator (str): The operator for filtering, e.g., 'Equal'.
            valueText (str): The value for filtering, e.g., '2017*'.

        Example:
            fetch_memories(query="some query", params={...})

        NOTE(review): `params` is currently accepted but not applied — the
        where_filter is commented out below.
        """
        if self.db_type == "pinecone":
            # Fetch Pinecone memories here
            pass
        elif self.db_type == "weaviate":
            retriever = self.init_weaviate(self.namespace)

            print(self.namespace)
            print(str(datetime.now()))
            print(observation)

            output = retriever.get_relevant_documents(
                observation
                # ,
                # score=True
                # ,
                # where_filter=params
            )
            print(output)
            return output

    def delete_memories(self, params: None):
        """Batch-delete objects in this namespace matching the given where-filter."""
        # FIX(review): the client was previously constructed with
        # url=os.environ.get('WEAVIATE_API_KEY') — the API key used as the
        # URL — so every delete failed to connect.  Route through the shared
        # factory, which reads WEAVIATE_URL correctly.
        client = self.init_weaviate_client(namespace=self.namespace)
        client.batch.delete_objects(
            class_name=self.namespace,
            # Same `where` filter as in the GraphQL API
            where=params,
        )

    def update_memories(self):
        """Placeholder — updating stored memories is not implemented yet."""
        pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class SemanticMemory:
|
||||||
|
def __init__(self, user_id: str, memory_id:str, ltm_memory_id:str, index_name: str, db_type:str="weaviate", namespace:str="SEMANTICMEMORY"):
|
||||||
|
# Add any semantic memory-related attributes or setup here
|
||||||
|
self.user_id=user_id
|
||||||
|
self.index_name = index_name
|
||||||
|
self.namespace = namespace
|
||||||
|
self.semantic_memory_id = str(uuid.uuid4())
|
||||||
|
self.memory_id = memory_id
|
||||||
|
self.ltm_memory_id = ltm_memory_id
|
||||||
|
self.vector_db = VectorDB(user_id=user_id, memory_id= self.memory_id, ltm_memory_id = self.ltm_memory_id, index_name=index_name, db_type=db_type, namespace=self.namespace)
|
||||||
|
self.db_type = db_type
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _update_memories(self ,memory_id:str="None", semantic_memory: str="None") -> None:
|
||||||
|
"""Update semantic memory for the user"""
|
||||||
|
|
||||||
|
if self.db_type == "weaviate":
|
||||||
|
self.vector_db.add_memories( observation = semantic_memory)
|
||||||
|
|
||||||
|
elif self.db_type == "pinecone":
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_memories(self, observation: str,params) -> dict[str, str] | str:
|
||||||
|
"""Fetch related characteristics, preferences or dislikes for a user."""
|
||||||
|
# self.init_pinecone(index_name=self.index)
|
||||||
|
|
||||||
|
if self.db_type == "weaviate":
|
||||||
|
|
||||||
|
return self.vector_db.fetch_memories(observation, params)
|
||||||
|
|
||||||
|
elif self.db_type == "pinecone":
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class LongTermMemory:
    """Long-term memory: owns a SemanticMemory (episodic memory is planned)."""

    def __init__(self, user_id: str = "676", memory_id: str = None, index_name: str = None, namespace: str = None, db_type: str = "weaviate"):
        self.user_id = user_id
        self.memory_id = memory_id
        # Fresh id for this LTM instance, propagated to child stores.
        self.ltm_memory_id = str(uuid.uuid4())
        self.index_name = index_name
        self.namespace = namespace
        self.db_type = db_type
        # self.episodic_memory = EpisodicMemory()
        self.semantic_memory = SemanticMemory(
            user_id=self.user_id,
            memory_id=self.memory_id,
            ltm_memory_id=self.ltm_memory_id,
            index_name=self.index_name,
            db_type=self.db_type,
        )
|
||||||
|
|
||||||
|
class ShortTermMemory:
    """Short-term memory: owns an EpisodicBuffer for in-flight content."""

    def __init__(self, user_id: str = "676", memory_id: str = None, index_name: str = None, namespace: str = None, db_type: str = "weaviate"):
        self.user_id = user_id
        self.memory_id = memory_id
        self.namespace = namespace
        self.db_type = db_type
        # Fresh id for this STM instance.
        self.stm_memory_id = str(uuid.uuid4())
        self.index_name = index_name
        self.episodic_buffer = EpisodicBuffer(
            user_id=self.user_id,
            memory_id=self.memory_id,
            index_name=self.index_name,
            db_type=self.db_type,
        )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class EpisodicBuffer:
|
||||||
|
def __init__(self, user_id: str = "676", memory_id: str = None, index_name: str = None, namespace: str = 'EPISODICBUFFER', db_type: str = "weaviate"):
    """Set up buffer identifiers and the GPT-4 chat model used for filtering."""
    self.user_id = user_id
    self.memory_id = memory_id
    self.namespace = namespace
    self.db_type = db_type
    # NOTE(review): placeholder value — the sibling memory classes use
    # uuid4 here; confirm whether this should too.
    self.st_memory_id = "blah"
    self.index_name = index_name
    self.llm = ChatOpenAI(
        temperature=0.0,
        max_tokens=1200,
        openai_api_key=os.environ.get('OPENAI_API_KEY'),
        model_name="gpt-4-0613",
        callbacks=[MyCustomSyncHandler(), MyCustomAsyncHandler()],
    )
    # self.vector_db = VectorDB(user_id=user_id, memory_id= self.memory_id, st_memory_id = self.st_memory_id, index_name=index_name, db_type=db_type, namespace=self.namespace)
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_weights(self, context: str):
|
||||||
|
"""Computes the weights for the buffer"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _temporal_weighting(self, context: str):
|
||||||
|
"""Computes the temporal weighting for the buffer"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
# async def infer_schema_from_text(self, text: str):
|
||||||
|
# """Infer schema from text"""
|
||||||
|
#
|
||||||
|
# prompt_ = """ You are a json schema master. Create a JSON schema based on the following data and don't write anything else: {prompt} """
|
||||||
|
#
|
||||||
|
# complete_query = PromptTemplate(
|
||||||
|
# input_variables=["prompt"],
|
||||||
|
# template=prompt_,
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# chain = LLMChain(
|
||||||
|
# llm=self.llm, prompt=complete_query, verbose=True
|
||||||
|
# )
|
||||||
|
# chain_result = chain.run(prompt=text).strip()
|
||||||
|
#
|
||||||
|
# json_data = json.dumps(chain_result)
|
||||||
|
# return json_data
|
||||||
|
|
||||||
|
async def _fetch_memories(self, observation: str, namespace: str) -> str:
    """Look up `observation` in the given namespace of this user's vector store."""
    store = VectorDB(
        user_id=self.user_id,
        memory_id=self.memory_id,
        st_memory_id=self.st_memory_id,
        index_name=self.index_name,
        db_type=self.db_type,
        namespace=namespace,
    )
    return await store.fetch_memories(observation=observation)
|
||||||
|
|
||||||
|
async def _add_memories(self, observation: str, namespace: str):
    """Persist `observation` into the given namespace of this user's vector store."""
    store = VectorDB(
        user_id=self.user_id,
        memory_id=self.memory_id,
        st_memory_id=self.st_memory_id,
        index_name=self.index_name,
        db_type=self.db_type,
        namespace=namespace,
    )
    return await store.add_memories(observation)
|
||||||
|
|
||||||
|
|
||||||
|
def encoding(self, document: str, namespace: str = "EPISODICBUFFER") -> None:
    """Encoding for the buffer, stores raw data in the buffer.

    Note, this is not comp-sci encoding, but rather encoding in the sense of
    storing the content in the buffer.
    """
    store = VectorDB(
        user_id=self.user_id,
        memory_id=self.memory_id,
        st_memory_id=self.st_memory_id,
        index_name=self.index_name,
        db_type=self.db_type,
        namespace=namespace,
    )
    # NOTE(review): add_memories is async — called without await from this
    # sync method it returns an un-run coroutine; confirm intended behavior.
    result = store.add_memories(document)
    return result
|
||||||
|
|
||||||
|
async def main_buffer(self, user_input=None, content=None):
    """AI buffer to convert unstructured data to structured data.

    :param user_input: the user's query, run through a filtering LLM chain.
    :param content: optional raw content; when given it is first stored
        ("encoded") in the episodic buffer before filtering.
    """
    # fix: removed several hundred lines of commented-out scratch code
    # (agent/tool wiring, schema mapping, memory routing) that obscured the
    # two live paths below; live calls and prompt strings are unchanged.
    if content is not None:
        # We need to encode the content. Note, this is not comp-sci encoding,
        # but rather encoding in the sense of storing the content in the buffer.
        self.encoding(content)
        prompt_filter = ChatPromptTemplate.from_template("Filter and remove uneccessary information that is not relevant in the user query {query}")
        chain_filter = prompt_filter | self.llm
        output = await chain_filter.ainvoke({"query": user_input})
        print(output)
    if content is None:
        # Sensory and Linguistic Processing
        prompt_filter = ChatPromptTemplate.from_template("Filter and remove uneccessary information that is not relevant in the user query {query}")
        chain_filter = prompt_filter | self.llm
        output = await chain_filter.ainvoke({"query": user_input})
        # NOTE(review): the translation result is computed (a network call)
        # but never used downstream — confirm whether it can be removed.
        translation = GoogleTranslator(source='auto', target='en').translate(text=output.content)

    # Placeholder processing strategies — currently unimplemented stubs.
    def top_down_processing():
        """Top-down processing"""

    def bottom_up_processing():
        """Bottom-up processing"""
        pass

    def interactive_processing():
        """interactive processing"""
        pass

    # Placeholder value; working-memory activation is not yet modelled.
    working_memory_activation = "bla"

    # NOTE(review): the instruction below is hard-coded rather than derived
    # from ``user_input`` — presumably a prototype shortcut; confirm intent.
    prompt_chunk = ChatPromptTemplate.from_template("Can you break down the instruction 'Structure a PDF and load it into duckdb' into smaller tasks or actions? Return only tasks or actions. Be brief")
    chain_chunk = prompt_chunk | self.llm
    output_chunks = await chain_chunk.ainvoke({"query": output.content})
    print(output_chunks.content)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: define the short-term memory (STM) component
# TODO: define the long-term memory (LTM) component
||||||
|
class Memory:
    """Facade over a user's long-term and short-term memory stores.

    Construction is cheap and synchronous; the underlying stores are built
    asynchronously via :meth:`async_init`, which must be awaited before any
    of the ``_*`` helper methods are used.
    """

    load_dotenv()

    # Runtime configuration pulled from the environment at class-definition time.
    OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", 0.0))
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

    def __init__(self, user_id: str = "676", index_name: str = None, knowledge_source: str = None,
                 knowledge_type: str = None, db_type: str = "weaviate", namespace: str = None) -> None:
        self.user_id = user_id
        self.index_name = index_name
        self.db_type = db_type
        self.knowledge_source = knowledge_source
        self.knowledge_type = knowledge_type
        # Fresh identifier per Memory instance, shared by both stores.
        self.memory_id = str(uuid.uuid4())
        # Populated by async_init(); None until then.
        self.long_term_memory = None
        self.short_term_memory = None
        self.namespace = namespace
        # NOTE(review): load_dotenv() already ran in the class body above;
        # this second call is redundant but harmless, kept for behavior.
        load_dotenv()

    # Asynchronous factory function for creating LongTermMemory.
    async def async_create_long_term_memory(self, user_id, memory_id, index_name, namespace, db_type):
        # Perform asynchronous initialization steps if needed.
        return LongTermMemory(
            user_id=user_id, memory_id=memory_id, index_name=index_name,
            namespace=namespace, db_type=db_type
        )

    # Asynchronous factory function for creating ShortTermMemory.
    async def async_create_short_term_memory(self, user_id, memory_id, index_name, db_type):
        # Perform asynchronous initialization steps if needed.
        return ShortTermMemory(
            user_id=user_id, memory_id=memory_id, index_name=index_name, db_type=db_type
        )

    async def async_init(self):
        """Asynchronously build both memory stores.

        fix: the original defined ``async_init`` twice; the first definition
        (which built only the long-term store) was dead code silently
        shadowed by this one, so it has been removed.
        """
        self.long_term_memory = await self.async_create_long_term_memory(
            user_id=self.user_id, memory_id=self.memory_id, index_name=self.index_name,
            namespace=self.namespace, db_type=self.db_type
        )
        self.short_term_memory = await self.async_create_short_term_memory(
            user_id=self.user_id, memory_id=self.memory_id, index_name=self.index_name,
            db_type=self.db_type
        )

    def _update_semantic_memory(self, semantic_memory: str):
        # Delegate to the long-term semantic store.
        return self.long_term_memory.semantic_memory._update_memories(
            memory_id=self.memory_id,
            semantic_memory=semantic_memory
        )

    def _fetch_semantic_memory(self, observation, params):
        # Delegate to the long-term semantic store.
        return self.long_term_memory.semantic_memory._fetch_memories(
            observation=observation, params=params
        )

    async def _run_buffer(self, user_input: str):
        # Run the episodic buffer's main processing loop on the query.
        return await self.short_term_memory.episodic_buffer.main_buffer(user_input=user_input)

    async def _add_memories_buffer(self, user_input: str, namespace: str = None):
        # Store the observation in the episodic buffer under *namespace*.
        return await self.short_term_memory.episodic_buffer._add_memories(observation=user_input, namespace=namespace)

    async def _fetch_memories_buffer(self, user_input: str, namespace: str = None):
        # Retrieve matching observations from the episodic buffer.
        return await self.short_term_memory.episodic_buffer._fetch_memories(observation=user_input, namespace=namespace)
|
async def main():
    """Smoke-test driver: add a memory, then fetch it back."""
    memory = Memory(user_id="123")
    await memory.async_init()

    added = await memory._add_memories_buffer(user_input="bla bla bla", namespace="test")
    print(added)
    # fix: the original added under namespace "test" but fetched from "Test",
    # so the fetch could miss what was just written; use the same namespace.
    fetched = await memory._fetch_memories_buffer(user_input="bla bla bla", namespace="test")
    print(fetched)
||||||
|
if __name__ == "__main__":
    # Run the async smoke test on a fresh event loop.
    from asyncio import run

    run(main())
|
||||||
|
|
||||||
|
# bb = agent._update_semantic_memory(semantic_memory="Users core summary")
|
||||||
|
# bb = agent._fetch_semantic_memory(observation= "Users core summary", params = {
|
||||||
|
# "path": ["inserted_at"],
|
||||||
|
# "operator": "Equal",
|
||||||
|
# "valueText": "*2023*"
|
||||||
|
# })
|
||||||
|
# buffer = agent._run_buffer(user_input="I want to get a schema for my data")
|
||||||
|
# print(bb)
|
||||||
|
# rrr = {
|
||||||
|
# "path": ["year"],
|
||||||
|
# "operator": "Equal",
|
||||||
|
# "valueText": "2017*"
|
||||||
|
# }
|
||||||
|
|
||||||
BIN
level_2/personal_receipts/2017/de/public_transport/118NP8.pdf
Normal file
BIN
level_2/personal_receipts/2017/de/public_transport/118NP8.pdf
Normal file
Binary file not shown.
BIN
level_2/personal_receipts/2017/de/public_transport/3ZCCCW.pdf
Normal file
BIN
level_2/personal_receipts/2017/de/public_transport/3ZCCCW.pdf
Normal file
Binary file not shown.
BIN
level_2/personal_receipts/2017/de/public_transport/4GBEC9.pdf
Normal file
BIN
level_2/personal_receipts/2017/de/public_transport/4GBEC9.pdf
Normal file
Binary file not shown.
BIN
level_2/personal_receipts/2017/de/public_transport/96W2GF.pdf
Normal file
BIN
level_2/personal_receipts/2017/de/public_transport/96W2GF.pdf
Normal file
Binary file not shown.
3772
level_2/poetry.lock
generated
Normal file
3772
level_2/poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
48
level_2/pyproject.toml
Normal file
48
level_2/pyproject.toml
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
[tool.poetry]
|
||||||
|
name = "PromethAI_memory"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "PromethAI memory manager"
|
||||||
|
authors = ["Vasilije Markovic"]
|
||||||
|
readme = "README.md"
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.10"
|
||||||
|
#langchain = {git = "https://github.com/topoteretes/langchain.git" , tag = "v0.0.209"}
|
||||||
|
langchain = "v0.0.250"
|
||||||
|
|
||||||
|
nltk = "3.8.1"
|
||||||
|
openai = "0.27.8"
|
||||||
|
pinecone-client = "2.2.2"
|
||||||
|
python-dotenv = "1.0.0"
|
||||||
|
pyyaml = "6.0"
|
||||||
|
fastapi = "0.98.0"
|
||||||
|
uvicorn = "0.22.0"
|
||||||
|
googlemaps = "4.10.0"
|
||||||
|
jinja2 = "3.1.2"
|
||||||
|
replicate = "^0.8.4"
|
||||||
|
pexpect = "^4.8.0"
|
||||||
|
selenium = "^4.9.0"
|
||||||
|
playwright = "^1.32.1"
|
||||||
|
pytest-playwright = "^0.3.3"
|
||||||
|
boto3 = "^1.26.125"
|
||||||
|
gptcache = "^0.1.22"
|
||||||
|
redis = "^4.5.5"
|
||||||
|
gunicorn = "^20.1.0"
|
||||||
|
tiktoken = "^0.4.0"
|
||||||
|
google-search-results = "^2.4.2"
|
||||||
|
spacy = "^3.5.3"
|
||||||
|
python-jose = "^3.3.0"
|
||||||
|
pypdf = "^3.12.0"
|
||||||
|
fastjsonschema = "^2.18.0"
|
||||||
|
marvin = "^1.3.0"
|
||||||
|
dlt = { version ="^0.3.8" , extras = ["duckdb"]}
|
||||||
|
weaviate-client = "^3.22.1"
|
||||||
|
python-multipart = "^0.0.6"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
||||||
180
level_2/schema_registry/ticket_schema.json
Normal file
180
level_2/schema_registry/ticket_schema.json
Normal file
|
|
@ -0,0 +1,180 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"ticketType": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["online ticket", "ICE ticket"]
|
||||||
|
},
|
||||||
|
"departureDate": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"priceType": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["Flex price (single journey)"]
|
||||||
|
},
|
||||||
|
"class": {
|
||||||
|
"type": "integer",
|
||||||
|
"enum": [1]
|
||||||
|
},
|
||||||
|
"adult": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"quantity": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"BC50": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["quantity", "BC50"]
|
||||||
|
},
|
||||||
|
"journey": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"from": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"to": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"via": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"train": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["ICE"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["from", "to", "via", "train"]
|
||||||
|
},
|
||||||
|
"refundPolicy": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"payment": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"quantity": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"price": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"vat19": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"vat7": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["name", "quantity", "price", "vat19", "vat7"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"method": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["credit card"]
|
||||||
|
},
|
||||||
|
"transactionDetails": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"amount": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"VUNumber": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"transactionNumber": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"date": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"genNumber": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["amount", "VUNumber", "transactionNumber", "date", "genNumber"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["items", "total", "method", "transactionDetails"]
|
||||||
|
},
|
||||||
|
"bookingDetails": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"bookingDate": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time"
|
||||||
|
},
|
||||||
|
"bookingAddress": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"taxNumber": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["bookingDate", "bookingAddress", "taxNumber"]
|
||||||
|
},
|
||||||
|
"journeyDetails": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"validFrom": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"passengerName": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"orderNumber": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"stops": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"stop": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"date": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "time"
|
||||||
|
},
|
||||||
|
"track": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"product": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"reservation": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["stop", "date", "time", "track", "product", "reservation"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["validFrom", "passengerName", "orderNumber", "stops"]
|
||||||
|
},
|
||||||
|
"usageNotes": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["ticketType", "departureDate", "priceType", "class", "adult", "journey", "refundPolicy", "payment", "bookingDetails", "journeyDetails", "usageNotes"]
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue