diff --git a/level_2/level_2_pdf_vectorstore__dlt_contracts.py b/level_2/level_2_pdf_vectorstore__dlt_contracts.py
index 8212db028..8538ef6f5 100644
--- a/level_2/level_2_pdf_vectorstore__dlt_contracts.py
+++ b/level_2/level_2_pdf_vectorstore__dlt_contracts.py
@@ -13,6 +13,7 @@ import asyncio
 from typing import Any, Dict, List, Coroutine
 from deep_translator import (GoogleTranslator)
 from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers import PydanticOutputParser
 from langchain.schema import LLMResult, HumanMessage
 from langchain.callbacks.base import AsyncCallbackHandler, BaseCallbackHandler
 from pydantic import BaseModel, Field, parse_obj_as
@@ -512,14 +513,6 @@ class EpisodicBuffer:
         # self.vector_db = VectorDB(user_id=user_id, memory_id= self.memory_id, st_memory_id = self.st_memory_id, index_name=index_name, db_type=db_type, namespace=self.namespace)

-    def _compute_weights(self, context: str):
-        """Computes the weights for the buffer"""
-        pass
-
-    def _temporal_weighting(self, context: str):
-        """Computes the temporal weighting for the buffer"""
-        pass
-
     # async def infer_schema_from_text(self, text: str):
     #     """Infer schema from text"""
     #
@@ -739,7 +732,46 @@ class EpisodicBuffer:
         # Extract the list of tasks
         tasks_list = data["tasks"]

+        result_tasks =[]
+        for task in tasks_list:
+            class PromptWrapper(BaseModel):
+                observation: str = Field(
+                    description="observation we want to fetch from vectordb"
+                )
+            @tool("convert_to_structured", args_schema=PromptWrapper, return_direct=True)
+            def convert_to_structured( observation=None, json_schema=None):
+                """Convert unstructured data to structured data"""
+                BASE_DIR = os.getcwd()
+                json_path = os.path.join(BASE_DIR, "schema_registry", "ticket_schema.json")
+
+                def load_json_or_infer_schema(file_path, document_path):
+                    """Load JSON schema from file or infer schema from text"""
+
+                    # Attempt to load the JSON file
+                    with open(file_path, 'r') as file:
+                        json_schema = json.load(file)
+                    return json_schema
+
+                json_schema =load_json_or_infer_schema(json_path, None)
+                def run_open_ai_mapper(observation=None, json_schema=None):
+                    """Convert unstructured data to structured data"""
+
+                    prompt_msgs = [
+                        SystemMessage(
+                            content="You are a world class algorithm converting unstructured data into structured data."
+                        ),
+                        HumanMessage(content="Convert unstructured data to structured data:"),
+                        HumanMessagePromptTemplate.from_template("{input}"),
+                        HumanMessage(content="Tips: Make sure to answer in the correct format"),
+                    ]
+                    prompt_ = ChatPromptTemplate(messages=prompt_msgs)
+                    chain_funct = create_structured_output_chain(json_schema, prompt=prompt_, llm=self.llm, verbose=True)
+                    output = chain_funct.run(input=observation, llm=self.llm)
+                    return output
+
+                result = run_open_ai_mapper(observation, json_schema)
+                return result
             class TranslateText(BaseModel):
                 observation: str = Field(
                     description="observation we want to translate"
                 )
@@ -753,7 +785,7 @@ class EpisodicBuffer:

             agent = initialize_agent(
                 llm=self.llm,
-                tools=[translate_to_en],
+                tools=[translate_to_en, convert_to_structured],
                 agent=AgentType.OPENAI_FUNCTIONS,
                 verbose=True,

@@ -761,25 +793,50 @@ class EpisodicBuffer:
             print("HERE IS THE TASK", task)
             output = agent.run(input=task)
             print(output)
-            await self.encoding(output)
+            result_tasks.append(task)
+            result_tasks.append(output)
+
+
+        await self.encoding(str(result_tasks), self.namespace, params=params)

         buffer_result = await self._fetch_memories(observation=str(output), namespace=self.namespace)

-        #json here
-        prompt_filter = ChatPromptTemplate.from_template(
-            "Format and collect all outputs from the tasks presented here {tasks} and their results {results}")
-        chain_filter_chunk = prompt_filter | self.llm.bind(function_call={"TaskList": "tasks"}, functions=TaskList)
-        output = await chain_filter_chunk.ainvoke({"query": buffer_result})
-        print("HERE IS THE OUTPUT", output)
+        class EpisodicTask(BaseModel):
+            """Schema for an individual task."""
+            task_order: str = Field(..., description="The order at which the task needs to be performed")
+            task_name: str = Field(None, description="The task that needs to be performed")
+            operation: str = Field(None, description="The operation to be performed")
+        class EpisodicList(BaseModel):
+            """Schema for the record containing a list of tasks."""
+            tasks: List[EpisodicTask] = Field(..., description="List of tasks")
+            start_date: str = Field(..., description="The order at which the task needs to be performed")
+            end_date: str = Field(..., description="The order at which the task needs to be performed")
+            user_query: str = Field(..., description="The order at which the task needs to be performed")

-        memory = Memory(user_id=self.user_id)
-        await memory.async_init()
+        parser = PydanticOutputParser(pydantic_object=EpisodicList)

-        lookup_value = await memory._add_episodic_memory(observation=str(output), params={})
+        prompt = PromptTemplate(
+            template="Format the result.\n{format_instructions}\nOriginal query is: {query}\n Steps are: {steps}, buffer is: {buffer}",
+            input_variables=["query", "steps", "buffer"],
+            partial_variables={"format_instructions": parser.get_format_instructions()},
+        )
+
+        _input = prompt.format_prompt(query=user_input, steps=str(tasks_list), buffer=buffer_result)
+
+        return "a few things to do like load episodic memory in a structured format"
+
+        # output = self.llm(_input.to_string())
+        #
+        # parser.parse(output)
+        # memory = Memory(user_id=self.user_id)
+        # await memory.async_init()
+        #
+        # lookup_value = await memory._add_episodic_memory(observation=str(output), params=params)
+        # return lookup_value

         #load to buffer once is done
@@ -1163,8 +1220,8 @@ class Memory:
             params=params
         )

-    async def _run_buffer(self, user_input: str, content: str = None):
-        return await self.short_term_memory.episodic_buffer.main_buffer(user_input=user_input, content=content)
+    async def _run_buffer(self, user_input: str, content: str = None, params:str=None):
+        return await self.short_term_memory.episodic_buffer.main_buffer(user_input=user_input, content=content, params=params)

     async def _add_buffer_memory(self, user_input: str, namespace: str = None, params: dict = None):
         return await self.short_term_memory.episodic_buffer._add_memories(observation=user_input, namespace=namespace,
@@ -1197,7 +1254,7 @@ async def main():
         "validity_end": "2024-07-31"
     }

-    gg = await memory._run_buffer(user_input="i NEED TRANSLATION TO GERMAN ", content="i NEED TRANSLATION TO GERMAN ")
+    gg = await memory._run_buffer(user_input="i NEED TRANSLATION TO GERMAN ", content="i NEED TRANSLATION TO GERMAN ", params=params)
     print(gg)

     # gg = await memory._delete_buffer_memory()
diff --git a/level_2/poetry.lock b/level_2/poetry.lock
index cdfb4ca4a..8233ad038 100644
--- a/level_2/poetry.lock
+++ b/level_2/poetry.lock
@@ -1362,39 +1362,38 @@ files = [

 [[package]]
 name = "langchain"
-version = "0.0.250"
+version = "0.0.271"
 description = "Building applications with LLMs through composability"
 optional = false
 python-versions = ">=3.8.1,<4.0"
 files = [
-    {file = "langchain-0.0.250-py3-none-any.whl", hash = "sha256:65b3520f507e848edd88a35a70700971bbbf822fda65f621ccf44a3bb36ad03a"},
-    {file = "langchain-0.0.250.tar.gz", hash = "sha256:1b5775d6a472f633bb06e794f58cb6ff5d1eeb2da603b64a6a15013f8f61ee3f"},
+    {file = "langchain-0.0.271-py3-none-any.whl", hash = "sha256:3ca68c9cf04edb42ce9225adc65ee739e5e00ed55d08aeb06a47391f3c59018c"},
+    {file = "langchain-0.0.271.tar.gz", hash = "sha256:f79d19405b755608216d1850de4a945a2bceb35c5ca8e4f7a4f9e29a366b097e"},
 ]

 [package.dependencies]
 aiohttp = ">=3.8.3,<4.0.0"
 async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""}
 dataclasses-json = ">=0.5.7,<0.6.0"
-langsmith = ">=0.0.11,<0.1.0"
+langsmith = ">=0.0.21,<0.1.0"
 numexpr = ">=2.8.4,<3.0.0"
 numpy = ">=1,<2"
-openapi-schema-pydantic = ">=1.2,<2.0"
-pydantic = ">=1,<2"
-PyYAML = ">=5.4.1"
+pydantic = ">=1,<3"
+PyYAML = ">=5.3"
 requests = ">=2,<3"
 SQLAlchemy = ">=1.4,<3"
 tenacity = ">=8.1.0,<9.0.0"

 [package.extras]
-all = ["O365 (>=2.0.26,<3.0.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "amadeus (>=8.1.0)", "anthropic (>=0.3,<0.4)", "arxiv (>=1.4,<2.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "awadb (>=0.3.9,<0.4.0)", "azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "beautifulsoup4 (>=4,<5)", "clarifai (>=9.1.0)", "clickhouse-connect (>=0.5.14,<0.6.0)", "cohere (>=4,<5)", "deeplake (>=3.6.8,<4.0.0)", "docarray[hnswlib] (>=0.32.0,<0.33.0)", "duckduckgo-search (>=3.8.3,<4.0.0)", "elasticsearch (>=8,<9)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "google-api-python-client (==2.70.0)", "google-auth (>=2.18.1,<3.0.0)", "google-search-results (>=2,<3)", "gptcache (>=0.1.7)", "html2text (>=2020.1.16,<2021.0.0)", "huggingface_hub (>=0,<1)", "jina (>=3.14,<4.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lancedb (>=0.1,<0.2)", "langkit (>=0.0.6,<0.1.0)", "lark (>=1.1.5,<2.0.0)", "libdeeplake (>=0.0.60,<0.0.61)", "librosa (>=0.10.0.post2,<0.11.0)", "lxml (>=4.9.2,<5.0.0)", "manifest-ml (>=0.0.1,<0.0.2)", "marqo (>=0.11.0,<0.12.0)", "momento (>=1.5.0,<2.0.0)", "nebula3-python (>=3.4.0,<4.0.0)", "neo4j (>=5.8.1,<6.0.0)", "networkx (>=2.6.3,<3.0.0)", "nlpcloud (>=1,<2)", "nltk (>=3,<4)", "nomic (>=1.0.43,<2.0.0)", "octoai-sdk (>=0.1.1,<0.2.0)", "openai (>=0,<1)", "openlm (>=0.0.5,<0.0.6)", "opensearch-py (>=2.0.0,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pexpect (>=4.8.0,<5.0.0)", "pgvector (>=0.1.6,<0.2.0)", "pinecone-client (>=2,<3)", "pinecone-text (>=0.4.2,<0.5.0)", "psycopg2-binary (>=2.9.5,<3.0.0)", "pymongo (>=4.3.3,<5.0.0)", "pyowm (>=3.3.0,<4.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pytesseract (>=0.3.10,<0.4.0)", "python-arango (>=7.5.9,<8.0.0)", "pyvespa (>=0.33.0,<0.34.0)", "qdrant-client (>=1.3.1,<2.0.0)", "rdflib (>=6.3.2,<7.0.0)", "redis (>=4,<5)", "requests-toolbelt (>=1.0.0,<2.0.0)", "sentence-transformers (>=2,<3)", "singlestoredb (>=0.7.1,<0.8.0)", "spacy (>=3,<4)", "steamship (>=2.16.9,<3.0.0)", "tensorflow-text (>=2.11.0,<3.0.0)", "tigrisdb (>=1.0.0b6,<2.0.0)", "tiktoken (>=0.3.2,<0.4.0)", "torch (>=1,<3)", "transformers (>=4,<5)", "weaviate-client (>=3,<4)", "wikipedia (>=1,<2)", "wolframalpha (==5.0.0)", "xinference (>=0.0.6,<0.0.7)"]
+all = ["O365 (>=2.0.26,<3.0.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "amadeus (>=8.1.0)", "arxiv (>=1.4,<2.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "awadb (>=0.3.9,<0.4.0)", "azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "beautifulsoup4 (>=4,<5)", "clarifai (>=9.1.0)", "clickhouse-connect (>=0.5.14,<0.6.0)", "cohere (>=4,<5)", "deeplake (>=3.6.8,<4.0.0)", "docarray[hnswlib] (>=0.32.0,<0.33.0)", "duckduckgo-search (>=3.8.3,<4.0.0)", "elasticsearch (>=8,<9)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "google-api-python-client (==2.70.0)", "google-auth (>=2.18.1,<3.0.0)", "google-search-results (>=2,<3)", "gptcache (>=0.1.7)", "html2text (>=2020.1.16,<2021.0.0)", "huggingface_hub (>=0,<1)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lancedb (>=0.1,<0.2)", "langkit (>=0.0.6,<0.1.0)", "lark (>=1.1.5,<2.0.0)", "libdeeplake (>=0.0.60,<0.0.61)", "librosa (>=0.10.0.post2,<0.11.0)", "lxml (>=4.9.2,<5.0.0)", "manifest-ml (>=0.0.1,<0.0.2)", "marqo (>=1.2.4,<2.0.0)", "momento (>=1.5.0,<2.0.0)", "nebula3-python (>=3.4.0,<4.0.0)", "neo4j (>=5.8.1,<6.0.0)", "networkx (>=2.6.3,<3.0.0)", "nlpcloud (>=1,<2)", "nltk (>=3,<4)", "nomic (>=1.0.43,<2.0.0)", "openai (>=0,<1)", "openlm (>=0.0.5,<0.0.6)", "opensearch-py (>=2.0.0,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pexpect (>=4.8.0,<5.0.0)", "pgvector (>=0.1.6,<0.2.0)", "pinecone-client (>=2,<3)", "pinecone-text (>=0.4.2,<0.5.0)", "psycopg2-binary (>=2.9.5,<3.0.0)", "pymongo (>=4.3.3,<5.0.0)", "pyowm (>=3.3.0,<4.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pytesseract (>=0.3.10,<0.4.0)", "python-arango (>=7.5.9,<8.0.0)", "pyvespa (>=0.33.0,<0.34.0)", "qdrant-client (>=1.3.1,<2.0.0)", "rdflib (>=6.3.2,<7.0.0)", "redis (>=4,<5)", "requests-toolbelt (>=1.0.0,<2.0.0)", "sentence-transformers (>=2,<3)", "singlestoredb (>=0.7.1,<0.8.0)", "tensorflow-text (>=2.11.0,<3.0.0)", "tigrisdb (>=1.0.0b6,<2.0.0)", "tiktoken (>=0.3.2,<0.4.0)", "torch (>=1,<3)", "transformers (>=4,<5)", "weaviate-client (>=3,<4)", "wikipedia (>=1,<2)", "wolframalpha (==5.0.0)"]
 azure = ["azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "azure-search-documents (==11.4.0b6)", "openai (>=0,<1)"]
 clarifai = ["clarifai (>=9.1.0)"]
 cohere = ["cohere (>=4,<5)"]
 docarray = ["docarray[hnswlib] (>=0.32.0,<0.33.0)"]
 embeddings = ["sentence-transformers (>=2,<3)"]
-extended-testing = ["atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.0.7,<0.0.8)", "chardet (>=5.1.0,<6.0.0)", "esprima (>=4.0.1,<5.0.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "gql (>=3.4.1,<4.0.0)", "html2text (>=2020.1.16,<2021.0.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lxml (>=4.9.2,<5.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "openai (>=0,<1)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "tqdm (>=4.48.0)", "xinference (>=0.0.6,<0.0.7)", "zep-python (>=0.32)"]
+extended-testing = ["amazon-textract-caller (<2)", "atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.0.7,<0.0.8)", "chardet (>=5.1.0,<6.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "gql (>=3.4.1,<4.0.0)", "html2text (>=2020.1.16,<2021.0.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lxml (>=4.9.2,<5.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "openai (>=0,<1)", "openapi-schema-pydantic (>=1.2,<2.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "tqdm (>=4.48.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)"]
 javascript = ["esprima (>=4.0.1,<5.0.0)"]
-llms = ["anthropic (>=0.3,<0.4)", "clarifai (>=9.1.0)", "cohere (>=4,<5)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (>=0,<1)", "openllm (>=0.1.19)", "openlm (>=0.0.5,<0.0.6)", "torch (>=1,<3)", "transformers (>=4,<5)", "xinference (>=0.0.6,<0.0.7)"]
+llms = ["clarifai (>=9.1.0)", "cohere (>=4,<5)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (>=0,<1)", "openlm (>=0.0.5,<0.0.6)", "torch (>=1,<3)", "transformers (>=4,<5)"]
 openai = ["openai (>=0,<1)", "tiktoken (>=0.3.2,<0.4.0)"]
 qdrant = ["qdrant-client (>=1.3.1,<2.0.0)"]
 text-helpers = ["chardet (>=5.1.0,<6.0.0)"]
@@ -1875,20 +1874,6 @@ dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-moc
 embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"]
 wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"]

-[[package]]
-name = "openapi-schema-pydantic"
-version = "1.2.4"
-description = "OpenAPI (v3) specification schema as pydantic class"
-optional = false
-python-versions = ">=3.6.1"
-files = [
-    {file = "openapi-schema-pydantic-1.2.4.tar.gz", hash = "sha256:3e22cf58b74a69f752cc7e5f1537f6e44164282db2700cbbcd3bb99ddd065196"},
-    {file = "openapi_schema_pydantic-1.2.4-py3-none-any.whl", hash = "sha256:a932ecc5dcbb308950282088956e94dea069c9823c84e507d64f6b622222098c"},
-]
-
-[package.dependencies]
-pydantic = ">=1.8.2"
-
 [[package]]
 name = "orjson"
 version = "3.9.5"
@@ -3795,4 +3780,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "788ca51ba313eac5f1dbcadfd7f91109b8b7a7734a1f16e10c8fb2e3b435a606"
+content-hash = "5629225437c5aec01f9f862d46d6d1e68abde4c42a0c1ad709df875883171991"
diff --git a/level_2/pyproject.toml b/level_2/pyproject.toml
index 461255af9..5a252ba37 100644
--- a/level_2/pyproject.toml
+++ b/level_2/pyproject.toml
@@ -8,7 +8,7 @@ readme = "README.md"

 [tool.poetry.dependencies]
 python = "^3.10"
 #langchain = {git = "https://github.com/topoteretes/langchain.git" , tag = "v0.0.209"}
-langchain = "v0.0.250"
+langchain = "v0.0.271"
 nltk = "3.8.1"
 openai = "0.27.8"
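
Note: the main_buffer hunk above leaves the PydanticOutputParser round trip partly commented out. A minimal standalone sketch of that flow, assuming langchain ~0.0.271, an OPENAI_API_KEY in the environment, and a hypothetical gpt-4 ChatOpenAI instance (the EpisodicTask/EpisodicList fields mirror the models defined in the diff, trimmed for brevity), looks roughly like this:

# sketch only -- not part of the patch
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field


class EpisodicTask(BaseModel):
    task_order: str = Field(..., description="Order in which the task runs")
    task_name: str = Field(None, description="Task to perform")
    operation: str = Field(None, description="Operation to perform")


class EpisodicList(BaseModel):
    tasks: List[EpisodicTask] = Field(..., description="List of tasks")


parser = PydanticOutputParser(pydantic_object=EpisodicList)

prompt = PromptTemplate(
    template="Format the result.\n{format_instructions}\nOriginal query is: {query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

llm = ChatOpenAI(temperature=0, model_name="gpt-4")  # hypothetical model choice

# predict() returns the raw text completion; the parser validates it
# against EpisodicList and returns a typed pydantic object.
output = llm.predict(prompt.format_prompt(query="translate my ticket to German").to_string())
episodic_list = parser.parse(output)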