Merge pull request #18 from topoteretes/extend_search_and_tokenizers

Extend search and tokenizers
This commit is contained in:
Vasilije 2023-10-09 21:13:24 +02:00 committed by GitHub
commit e119b7697f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 151 additions and 108 deletions

View file

@ -5,14 +5,21 @@
### Description
Initial code lets you do four operations:
RAG test manager can be used via API (in progress) or via the CLI
1. Add to memory
2. Retrieve from memory
3. Structure the data to schema
4. Load to a database
Make sure to run scripts/create_database.py
## How to use
After that, you can run:
``` python test_runner.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "path/to/test_set.json" \
--user_id "666" \
--metadata "path/to/metadata.json"
```
## How to start
## Installation
@ -22,6 +29,24 @@ Initial code lets you do three operations:
```docker compose up promethai_mem ```
``` poetry shell ```
Make sure to run
``` python scripts/create_database.py ```
After that, you can run:
``` python rag_test_manager.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "example_data/test_set.json" \
--user_id "666" \
--metadata "example_data/metadata.json"
```
To see examples of test_set.json and metadata.json, check the files in the "example_data" folder
## Clean database
@ -30,7 +55,7 @@ Initial code lets you do three operations:
```docker volume prune ```
docker compose up --force-recreate --build promethai_mem
``` docker compose up --force-recreate --build promethai_mem ```
## Usage

View file

@ -0,0 +1,13 @@
{
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31"
}

View file

@ -0,0 +1,22 @@
[
{
"question": "Who is the main character in 'The Call of the Wild'?",
"answer": "Buck"
},
{
"question": "Who wrote 'The Call of the Wild'?",
"answer": "Jack London"
},
{
"question": "Where does Buck live at the start of the book?",
"answer": "In the Santa Clara Valley, at Judge Millers place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]

View file

@ -1,3 +1,5 @@
import argparse
import json
from enum import Enum
import sys
import os
@ -383,49 +385,87 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
print(test_result_collection)
add_entity(session, TestOutput(id=test_id, user_id=user_id, content=str(test_result_collection)))
add_entity(session, TestOutput(id=test_id, user_id=user_id, test_results=str(test_result_collection)))
async def main():
#
# params = {
# "version": "1.0",
# "agreement_id": "AG123456",
# "privacy_policy": "https://example.com/privacy",
# "terms_of_service": "https://example.com/terms",
# "format": "json",
# "schema_version": "1.1",
# "checksum": "a1b2c3d4e5f6",
# "owner": "John Doe",
# "license": "MIT",
# "validity_start": "2023-08-01",
# "validity_end": "2024-07-31",
# }
#
# test_set = [
# {
# "question": "Who is the main character in 'The Call of the Wild'?",
# "answer": "Buck"
# },
# {
# "question": "Who wrote 'The Call of the Wild'?",
# "answer": "Jack London"
# },
# {
# "question": "Where does Buck live at the start of the book?",
# "answer": "In the Santa Clara Valley, at Judge Millers place."
# },
# {
# "question": "Why is Buck kidnapped?",
# "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
# },
# {
# "question": "How does Buck become the leader of the sled dog team?",
# "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
# }
# ]
# result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
#
parser = argparse.ArgumentParser(description="Run tests against a document.")
parser.add_argument("--url", required=True, help="URL of the document to test.")
parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
parser.add_argument("--user_id", required=True, help="User ID.")
parser.add_argument("--params", help="Additional parameters in JSON format.")
parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
params = {
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31",
}
args = parser.parse_args()
test_set = [
{
"question": "Who is the main character in 'The Call of the Wild'?",
"answer": "Buck"
},
{
"question": "Who wrote 'The Call of the Wild'?",
"answer": "Jack London"
},
{
"question": "Where does Buck live at the start of the book?",
"answer": "In the Santa Clara Valley, at Judge Millers place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]
result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
try:
with open(args.test_set, "r") as file:
test_set = json.load(file)
if not isinstance(test_set, list): # Expecting a list
raise TypeError("Parsed test_set JSON is not a list.")
except Exception as e:
print(f"Error loading test_set: {str(e)}")
return
try:
with open(args.metadata, "r") as file:
metadata = json.load(file)
if not isinstance(metadata, dict):
raise TypeError("Parsed metadata JSON is not a dictionary.")
except Exception as e:
print(f"Error loading metadata: {str(e)}")
return
if args.params:
try:
params = json.loads(args.params)
if not isinstance(params, dict):
raise TypeError("Parsed params JSON is not a dictionary.")
except json.JSONDecodeError as e:
print(f"Error parsing params: {str(e)}")
return
else:
params = None
await start_test(args.url, test_set, args.user_id, params, metadata)
if __name__ == "__main__":
import asyncio

View file

@ -1,7 +1,7 @@
from langchain.document_loaders import PyPDFLoader
import sys, os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.shared.chunk_strategy import ChunkStrategy
from shared.chunk_strategy import ChunkStrategy
import re
def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None):

View file

@ -1,8 +1,10 @@
from io import BytesIO
import fitz
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.vectordb.chunkers.chunkers import chunk_data
from vectordb.chunkers.chunkers import chunk_data
from llama_hub.file.base import SimpleDirectoryReader
import requests

View file

@ -248,9 +248,9 @@ class WeaviateVectorDB(VectorDB):
return client.batch.delete_objects(
class_name=namespace,
where={
"path": ["user_id"],
"path": ["version"],
"operator": "Equal",
"valueText": self.user_id,
"valueText": "1.0",
},
)

View file

@ -345,62 +345,3 @@ if __name__ == "__main__":
asyncio.run(main())
# Check for existing user
# existing_user = session.query(User).filter_by(id=user_id).first()
#
# if existing_user:
# self.memory_id = existing_user.memory_id
# existing_memories_classes = session.query(Memory).filter_by(id=user_id).first()
# self.memory_instances = []
#
# for memory in existing_memories_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# else:
# self.memory_id = str(uuid.uuid4())
# new_user = User(id=user_id, memory_id=self.memory_id) # Adjust as per your User model
# session.add(new_user)
# session.commit()
# memory_classes = ['Memory', 'SemanticMemory', 'EpisodicMemory']
# self.memory_instances = []
#
# for memory in memory_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# # fix this so it uploads relationships between memories
# session.add(Memory(id=self.memory_id, user_id=user_id))
# session.commit()
#
# if existing_user:
# attributes_list = session.query(Memory.attributes_list).filter_by(id=self.memory_id).scalar()
# attributes_list = ast.literal_eval(attributes_list)
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# methods_list = session.query(Memory.methods_list).filter_by(id=self.memory_id).scalar()
# methods_list = ast.literal_eval(methods_list)
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# else:
# attributes_list = ['user_id', 'index_name', 'db_type', 'knowledge_source', 'knowledge_type', 'memory_id',
# 'long_term_memory', 'short_term_memory', 'namespace']
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# # if old user, fetch attributes from memory table and load them like above
# # if new user, load methods from a list
# methods_list = ['async_create_long_term_memory', 'async_init', 'add_memories', "fetch_memories",
# 'async_create_short_term_memory',
# '_create_buffer_context', '_get_task_list', '_run_main_buffer',
# '_available_operations', '_provide_feedback']
# session.add(Memory(id=self.memory_id, user_id=user_id, methods_list=str(methods_list),
# attributes_list=str(attributes_list)))
# session.commit()
# # if old user, load methods from db
# # if new user, use class inheritance like below
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# # Safely convert string representation to a list