Merge pull request #18 from topoteretes/extend_search_and_tokenizers

Extend search and tokenizers
This commit is contained in:
Vasilije 2023-10-09 21:13:24 +02:00 committed by GitHub
commit e119b7697f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 151 additions and 108 deletions

View file

@ -5,14 +5,21 @@
### Description ### Description
Initial code lets you do three operations: RAG test manager can be used via API (in progress) or via the CLI
1. Add to memory Make sure to run scripts/create_database.py
2. Retrieve from memory
3. Structure the data to schema
4. Load to a database
#How to use After that, you can run:
``` python test_runner.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "path/to/test_set.json" \
--user_id "666" \
--metadata "path/to/metadata.json"
```
#How to start
## Installation ## Installation
@ -22,6 +29,24 @@ Initial code lets you do three operations:
```docker compose up promethai_mem ``` ```docker compose up promethai_mem ```
``` poetry shell ```
Make sure to run
``` python scripts/create_database.py ```
After that, you can run:
``` python rag_test_manager.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "example_data/test_set.json" \
--user_id "666" \
--metadata "example_data/metadata.json"
```
To see examples of test_set.json and metadata.json, check the files in the "example_data" folder
## Clean database ## Clean database
@ -30,7 +55,7 @@ Initial code lets you do three operations:
```docker volume prune ``` ```docker volume prune ```
docker compose up --force-recreate --build promethai_mem ``` docker compose up --force-recreate --build promethai_mem ```
## Usage ## Usage

View file

@ -0,0 +1,13 @@
{
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31"
}

View file

@ -0,0 +1,22 @@
[
{
"question": "Who is the main character in 'The Call of the Wild'?",
"answer": "Buck"
},
{
"question": "Who wrote 'The Call of the Wild'?",
"answer": "Jack London"
},
{
"question": "Where does Buck live at the start of the book?",
        "answer": "In the Santa Clara Valley, at Judge Miller's place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]

View file

@ -1,3 +1,5 @@
import argparse
import json
from enum import Enum from enum import Enum
import sys import sys
import os import os
@ -383,49 +385,87 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
print(test_result_collection) print(test_result_collection)
add_entity(session, TestOutput(id=test_id, user_id=user_id, content=str(test_result_collection))) add_entity(session, TestOutput(id=test_id, user_id=user_id, test_results=str(test_result_collection)))
async def main(): async def main():
#
# params = {
# "version": "1.0",
# "agreement_id": "AG123456",
# "privacy_policy": "https://example.com/privacy",
# "terms_of_service": "https://example.com/terms",
# "format": "json",
# "schema_version": "1.1",
# "checksum": "a1b2c3d4e5f6",
# "owner": "John Doe",
# "license": "MIT",
# "validity_start": "2023-08-01",
# "validity_end": "2024-07-31",
# }
#
# test_set = [
# {
# "question": "Who is the main character in 'The Call of the Wild'?",
# "answer": "Buck"
# },
# {
# "question": "Who wrote 'The Call of the Wild'?",
# "answer": "Jack London"
# },
# {
# "question": "Where does Buck live at the start of the book?",
# "answer": "In the Santa Clara Valley, at Judge Millers place."
# },
# {
# "question": "Why is Buck kidnapped?",
# "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
# },
# {
# "question": "How does Buck become the leader of the sled dog team?",
# "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
# }
# ]
# result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
#
parser = argparse.ArgumentParser(description="Run tests against a document.")
parser.add_argument("--url", required=True, help="URL of the document to test.")
parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
parser.add_argument("--user_id", required=True, help="User ID.")
parser.add_argument("--params", help="Additional parameters in JSON format.")
parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
params = { args = parser.parse_args()
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31",
}
test_set = [ try:
{ with open(args.test_set, "r") as file:
"question": "Who is the main character in 'The Call of the Wild'?", test_set = json.load(file)
"answer": "Buck" if not isinstance(test_set, list): # Expecting a list
}, raise TypeError("Parsed test_set JSON is not a list.")
{ except Exception as e:
"question": "Who wrote 'The Call of the Wild'?", print(f"Error loading test_set: {str(e)}")
"answer": "Jack London" return
},
{
"question": "Where does Buck live at the start of the book?",
"answer": "In the Santa Clara Valley, at Judge Millers place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]
result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
try:
with open(args.metadata, "r") as file:
metadata = json.load(file)
if not isinstance(metadata, dict):
raise TypeError("Parsed metadata JSON is not a dictionary.")
except Exception as e:
print(f"Error loading metadata: {str(e)}")
return
if args.params:
try:
params = json.loads(args.params)
if not isinstance(params, dict):
raise TypeError("Parsed params JSON is not a dictionary.")
except json.JSONDecodeError as e:
print(f"Error parsing params: {str(e)}")
return
else:
params = None
await start_test(args.url, test_set, args.user_id, params, metadata)
if __name__ == "__main__": if __name__ == "__main__":
import asyncio import asyncio

View file

@ -1,7 +1,7 @@
from langchain.document_loaders import PyPDFLoader from langchain.document_loaders import PyPDFLoader
import sys, os import sys, os
sys.path.append(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.shared.chunk_strategy import ChunkStrategy from shared.chunk_strategy import ChunkStrategy
import re import re
def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None): def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None):

View file

@ -1,8 +1,10 @@
from io import BytesIO from io import BytesIO
import fitz import fitz
# sys.path.append(os.path.dirname(os.path.abspath(__file__))) import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.vectordb.chunkers.chunkers import chunk_data from vectordb.chunkers.chunkers import chunk_data
from llama_hub.file.base import SimpleDirectoryReader from llama_hub.file.base import SimpleDirectoryReader
import requests import requests

View file

@ -248,9 +248,9 @@ class WeaviateVectorDB(VectorDB):
return client.batch.delete_objects( return client.batch.delete_objects(
class_name=namespace, class_name=namespace,
where={ where={
"path": ["user_id"], "path": ["version"],
"operator": "Equal", "operator": "Equal",
"valueText": self.user_id, "valueText": "1.0",
}, },
) )

View file

@ -345,62 +345,3 @@ if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())
# Check for existing user
# existing_user = session.query(User).filter_by(id=user_id).first()
#
# if existing_user:
# self.memory_id = existing_user.memory_id
# existing_memories_classes = session.query(Memory).filter_by(id=user_id).first()
# self.memory_instances = []
#
# for memory in existing_memories_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# else:
# self.memory_id = str(uuid.uuid4())
# new_user = User(id=user_id, memory_id=self.memory_id) # Adjust as per your User model
# session.add(new_user)
# session.commit()
# memory_classes = ['Memory', 'SemanticMemory', 'EpisodicMemory']
# self.memory_instances = []
#
# for memory in memory_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# # fix this so it uploads relationships between memories
# session.add(Memory(id=self.memory_id, user_id=user_id))
# session.commit()
#
# if existing_user:
# attributes_list = session.query(Memory.attributes_list).filter_by(id=self.memory_id).scalar()
# attributes_list = ast.literal_eval(attributes_list)
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# methods_list = session.query(Memory.methods_list).filter_by(id=self.memory_id).scalar()
# methods_list = ast.literal_eval(methods_list)
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# else:
# attributes_list = ['user_id', 'index_name', 'db_type', 'knowledge_source', 'knowledge_type', 'memory_id',
# 'long_term_memory', 'short_term_memory', 'namespace']
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# # if old user, fetch attributes from memory table and load them like above
# # if new user, load methods from a list
# methods_list = ['async_create_long_term_memory', 'async_init', 'add_memories', "fetch_memories",
# 'async_create_short_term_memory',
# '_create_buffer_context', '_get_task_list', '_run_main_buffer',
# '_available_operations', '_provide_feedback']
# session.add(Memory(id=self.memory_id, user_id=user_id, methods_list=str(methods_list),
# attributes_list=str(attributes_list)))
# session.commit()
# # if old user, load methods from db
# # if new user, use class inherintance like bellow
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# # Safely convert string representation to a list