From ba43c3751162bc4743c1d46c218b0da8f4c2b9ab Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Mon, 9 Oct 2023 20:35:37 +0200 Subject: [PATCH 1/3] Added embeddings as an option, added different types of text search --- level_3/Readme.md | 10 +++--- level_3/rag_test_manager.py | 2 +- level_3/vectordb/vectordb.py | 4 +-- level_3/vectorstore_manager.py | 59 ---------------------------------- 4 files changed, 8 insertions(+), 67 deletions(-) diff --git a/level_3/Readme.md b/level_3/Readme.md index 1c8c61ed6..c7f4eadae 100644 --- a/level_3/Readme.md +++ b/level_3/Readme.md @@ -5,12 +5,12 @@ ### Description -Initial code lets you do three operations: +RAG test manager can be used via API (inprogress) or via the CLI + +Make sure to run scripts/create_database.py + + -1. Add to memory -2. Retrieve from memory -3. Structure the data to schema -4. Load to a database #How to use diff --git a/level_3/rag_test_manager.py b/level_3/rag_test_manager.py index f30dffb11..410c8c8d4 100644 --- a/level_3/rag_test_manager.py +++ b/level_3/rag_test_manager.py @@ -383,7 +383,7 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None print(test_result_collection) - add_entity(session, TestOutput(id=test_id, user_id=user_id, content=str(test_result_collection))) + add_entity(session, TestOutput(id=test_id, user_id=user_id, test_results=str(test_result_collection))) async def main(): diff --git a/level_3/vectordb/vectordb.py b/level_3/vectordb/vectordb.py index 493de1c5e..80ed441b4 100644 --- a/level_3/vectordb/vectordb.py +++ b/level_3/vectordb/vectordb.py @@ -248,9 +248,9 @@ class WeaviateVectorDB(VectorDB): return client.batch.delete_objects( class_name=namespace, where={ - "path": ["user_id"], + "path": ["version"], "operator": "Equal", - "valueText": self.user_id, + "valueText": "1.0", }, ) diff --git a/level_3/vectorstore_manager.py b/level_3/vectorstore_manager.py index 9a7bb7c18..e001fc383 100644 --- 
a/level_3/vectorstore_manager.py +++ b/level_3/vectorstore_manager.py @@ -345,62 +345,3 @@ if __name__ == "__main__": asyncio.run(main()) -# Check for existing user -# existing_user = session.query(User).filter_by(id=user_id).first() -# -# if existing_user: -# self.memory_id = existing_user.memory_id -# existing_memories_classes = session.query(Memory).filter_by(id=user_id).first() -# self.memory_instances = [] -# -# for memory in existing_memories_classes: -# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace) -# self.memory_instances.append(instance) -# else: -# self.memory_id = str(uuid.uuid4()) -# new_user = User(id=user_id, memory_id=self.memory_id) # Adjust as per your User model -# session.add(new_user) -# session.commit() -# memory_classes = ['Memory', 'SemanticMemory', 'EpisodicMemory'] -# self.memory_instances = [] -# -# for memory in memory_classes: -# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace) -# self.memory_instances.append(instance) -# # fix this so it uploads relationships between memories -# session.add(Memory(id=self.memory_id, user_id=user_id)) -# session.commit() -# -# if existing_user: -# attributes_list = session.query(Memory.attributes_list).filter_by(id=self.memory_id).scalar() -# attributes_list = ast.literal_eval(attributes_list) -# for attr in attributes_list: -# self.memory_class.add_attribute(attr) -# methods_list = session.query(Memory.methods_list).filter_by(id=self.memory_id).scalar() -# methods_list = ast.literal_eval(methods_list) -# for class_instance in self.memory_instances: -# # , self.episodic_buffer_class]: -# for method in methods_list: -# class_instance.add_method(method) -# else: -# attributes_list = ['user_id', 'index_name', 'db_type', 'knowledge_source', 'knowledge_type', 'memory_id', -# 'long_term_memory', 'short_term_memory', 'namespace'] -# for attr in attributes_list: -# self.memory_class.add_attribute(attr) -# # if old user, 
fetch attributes from memory table and load them like above -# # if new user, load methods from a list -# methods_list = ['async_create_long_term_memory', 'async_init', 'add_memories', "fetch_memories", -# 'async_create_short_term_memory', -# '_create_buffer_context', '_get_task_list', '_run_main_buffer', -# '_available_operations', '_provide_feedback'] -# session.add(Memory(id=self.memory_id, user_id=user_id, methods_list=str(methods_list), -# attributes_list=str(attributes_list))) -# session.commit() -# # if old user, load methods from db -# # if new user, use class inherintance like bellow -# for class_instance in self.memory_instances: -# # , self.episodic_buffer_class]: -# for method in methods_list: -# class_instance.add_method(method) - -# # Safely convert string representation to a list From d2e0a29d429df28bcb576e599bf2120ac4b02fd1 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Mon, 9 Oct 2023 20:44:25 +0200 Subject: [PATCH 2/3] Added embeddings as an option, added different types of text search --- level_3/Readme.md | 31 ++++++++-- level_3/example_data/metadata.json | 13 +++++ level_3/example_data/test_set.json | 22 +++++++ level_3/rag_test_manager.py | 94 ++++++++++++++++++------------ 4 files changed, 120 insertions(+), 40 deletions(-) create mode 100644 level_3/example_data/metadata.json create mode 100644 level_3/example_data/test_set.json diff --git a/level_3/Readme.md b/level_3/Readme.md index c7f4eadae..3e13744a5 100644 --- a/level_3/Readme.md +++ b/level_3/Readme.md @@ -5,14 +5,21 @@ ### Description -RAG test manager can be used via API (inprogress) or via the CLI +RAG test manager can be used via API (in progress) or via the CLI Make sure to run scripts/create_database.py +After that, you can run: + +``` python rag_test_manager.py \ + --url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \ + --test_set "path/to/test_set.json" \ + --user_id "666" \ + --metadata "path/to/metadata.json" +``` - 
-#How to use +# How to start ## Installation @@ -22,6 +29,22 @@ Make sure to run scripts/create_database.py ```docker compose up promethai_mem ``` +Make sure to run + +``` python scripts/create_database.py ``` + +After that, you can run: + +``` python test_runner.py \ + --url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \ + --test_set "example_data/test_set.json" \ + --user_id "666" \ + --metadata "example_data/metadata.json" + +``` + +To see examples of test_set.json and metadata.json, check the files in the folder "example_data" + ## Clean database @@ -30,7 +53,7 @@ Make sure to run scripts/create_database.py ```docker volume prune ``` -docker compose up --force-recreate --build promethai_mem +``` docker compose up --force-recreate --build promethai_mem ``` ## Usage diff --git a/level_3/example_data/metadata.json b/level_3/example_data/metadata.json new file mode 100644 index 000000000..1ad43755e --- /dev/null +++ b/level_3/example_data/metadata.json @@ -0,0 +1,13 @@ +{ + "version": "1.0", + "agreement_id": "AG123456", + "privacy_policy": "https://example.com/privacy", + "terms_of_service": "https://example.com/terms", + "format": "json", + "schema_version": "1.1", + "checksum": "a1b2c3d4e5f6", + "owner": "John Doe", + "license": "MIT", + "validity_start": "2023-08-01", + "validity_end": "2024-07-31", + } \ No newline at end of file diff --git a/level_3/example_data/test_set.json b/level_3/example_data/test_set.json new file mode 100644 index 000000000..caf0e2e9f --- /dev/null +++ b/level_3/example_data/test_set.json @@ -0,0 +1,22 @@ + [ + { + "question": "Who is the main character in 'The Call of the Wild'?", + "answer": "Buck" + }, + { + "question": "Who wrote 'The Call of the Wild'?", + "answer": "Jack London" + }, + { + "question": "Where does Buck live at the start of the book?", + "answer": "In the Santa Clara Valley, at Judge Miller’s place." 
+ }, + { + "question": "Why is Buck kidnapped?", + "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush." + }, + { + "question": "How does Buck become the leader of the sled dog team?", + "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight." + } + ] \ No newline at end of file diff --git a/level_3/rag_test_manager.py b/level_3/rag_test_manager.py index 410c8c8d4..168aba073 100644 --- a/level_3/rag_test_manager.py +++ b/level_3/rag_test_manager.py @@ -1,3 +1,5 @@ +import argparse +import json from enum import Enum import sys import os @@ -386,46 +388,66 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None add_entity(session, TestOutput(id=test_id, user_id=user_id, test_results=str(test_result_collection))) async def main(): + # + # params = { + # "version": "1.0", + # "agreement_id": "AG123456", + # "privacy_policy": "https://example.com/privacy", + # "terms_of_service": "https://example.com/terms", + # "format": "json", + # "schema_version": "1.1", + # "checksum": "a1b2c3d4e5f6", + # "owner": "John Doe", + # "license": "MIT", + # "validity_start": "2023-08-01", + # "validity_end": "2024-07-31", + # } + # + # test_set = [ + # { + # "question": "Who is the main character in 'The Call of the Wild'?", + # "answer": "Buck" + # }, + # { + # "question": "Who wrote 'The Call of the Wild'?", + # "answer": "Jack London" + # }, + # { + # "question": "Where does Buck live at the start of the book?", + # "answer": "In the Santa Clara Valley, at Judge Miller’s place." + # }, + # { + # "question": "Why is Buck kidnapped?", + # "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush." + # }, + # { + # "question": "How does Buck become the leader of the sled dog team?", + # "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight." 
+ # } + # ] + # result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params) + # + parser = argparse.ArgumentParser(description="Run tests against a document.") + parser.add_argument("--url", required=True, help="URL of the document to test.") + parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.") + parser.add_argument("--user_id", required=True, help="User ID.") + parser.add_argument("--params", help="Additional parameters in JSON format.") + parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.") - params = { - "version": "1.0", - "agreement_id": "AG123456", - "privacy_policy": "https://example.com/privacy", - "terms_of_service": "https://example.com/terms", - "format": "json", - "schema_version": "1.1", - "checksum": "a1b2c3d4e5f6", - "owner": "John Doe", - "license": "MIT", - "validity_start": "2023-08-01", - "validity_end": "2024-07-31", - } + args = parser.parse_args() - test_set = [ - { - "question": "Who is the main character in 'The Call of the Wild'?", - "answer": "Buck" - }, - { - "question": "Who wrote 'The Call of the Wild'?", - "answer": "Jack London" - }, - { - "question": "Where does Buck live at the start of the book?", - "answer": "In the Santa Clara Valley, at Judge Miller’s place." - }, - { - "question": "Why is Buck kidnapped?", - "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush." - }, - { - "question": "How does Buck become the leader of the sled dog team?", - "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight." 
- } - ] - result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params) + with open(args.test_set, "r") as file: + test_set = json.load(file) + with open(args.metadata, "r") as file: + metadata = json.load(file) + if args.params: + params = json.loads(args.params) + else: + params = None + + await start_test(args.url, test_set, args.user_id, params, metadata=metadata) if __name__ == "__main__": import asyncio From 30678539e7c9337937a95308b93b49805c528365 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Mon, 9 Oct 2023 21:12:54 +0200 Subject: [PATCH 3/3] Fixes and added command line tool to run RAG --- level_3/Readme.md | 4 +++- level_3/example_data/metadata.json | 2 +- level_3/rag_test_manager.py | 28 ++++++++++++++++++++++----- level_3/vectordb/chunkers/chunkers.py | 2 +- level_3/vectordb/loaders/loaders.py | 6 ++++-- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/level_3/Readme.md b/level_3/Readme.md index 3e13744a5..3d66b64f1 100644 --- a/level_3/Readme.md +++ b/level_3/Readme.md @@ -29,13 +29,15 @@ After that, you can run: ```docker compose up promethai_mem ``` +``` poetry shell ``` + Make sure to run ``` python scripts/create_database.py ``` After that, you can run: -``` python test_runner.py \ +``` python rag_test_manager.py \ --url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \ --test_set "example_data/test_set.json" \ --user_id "666" \ --metadata "example_data/metadata.json" diff --git a/level_3/example_data/metadata.json b/level_3/example_data/metadata.json index 1ad43755e..8df9036cc 100644 --- a/level_3/example_data/metadata.json +++ b/level_3/example_data/metadata.json @@ -9,5 +9,5 @@ "owner": "John Doe", "license": "MIT", "validity_start": "2023-08-01", - "validity_end": "2024-07-31", + "validity_end": "2024-07-31" } \ No newline at end of file diff --git a/level_3/rag_test_manager.py b/level_3/rag_test_manager.py index 
168aba073..204b5f918 100644 --- a/level_3/rag_test_manager.py +++ b/level_3/rag_test_manager.py @@ -436,14 +436,32 @@ async def main(): args = parser.parse_args() - with open(args.test_set, "r") as file: - test_set = json.load(file) + try: + with open(args.test_set, "r") as file: + test_set = json.load(file) + if not isinstance(test_set, list): # Expecting a list + raise TypeError("Parsed test_set JSON is not a list.") + except Exception as e: + print(f"Error loading test_set: {str(e)}") + return - with open(args.metadata, "r") as file: - metadata = json.load(file) + try: + with open(args.metadata, "r") as file: + metadata = json.load(file) + if not isinstance(metadata, dict): + raise TypeError("Parsed metadata JSON is not a dictionary.") + except Exception as e: + print(f"Error loading metadata: {str(e)}") + return if args.params: - params = json.loads(args.params) + try: + params = json.loads(args.params) + if not isinstance(params, dict): + raise TypeError("Parsed params JSON is not a dictionary.") + except (json.JSONDecodeError, TypeError) as e: + print(f"Error parsing params: {str(e)}") + return else: params = None diff --git a/level_3/vectordb/chunkers/chunkers.py b/level_3/vectordb/chunkers/chunkers.py index a02433a77..3019b2478 100644 --- a/level_3/vectordb/chunkers/chunkers.py +++ b/level_3/vectordb/chunkers/chunkers.py @@ -1,7 +1,7 @@ from langchain.document_loaders import PyPDFLoader import sys, os sys.path.append(os.path.dirname(os.path.abspath(__file__))) -from level_3.shared.chunk_strategy import ChunkStrategy +from shared.chunk_strategy import ChunkStrategy import re def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None): diff --git a/level_3/vectordb/loaders/loaders.py b/level_3/vectordb/loaders/loaders.py index 9ecf2e40a..ead2605be 100644 --- a/level_3/vectordb/loaders/loaders.py +++ b/level_3/vectordb/loaders/loaders.py @@ -1,8 +1,10 @@ from io import BytesIO import fitz -# 
sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import os +import sys +sys.path.append(os.path.dirname(os.path.abspath(__file__))) -from level_3.vectordb.chunkers.chunkers import chunk_data +from vectordb.chunkers.chunkers import chunk_data from llama_hub.file.base import SimpleDirectoryReader import requests