Merge pull request #18 from topoteretes/extend_search_and_tokenizers

Extend search and tokenizers
This commit is contained in:
Vasilije 2023-10-09 21:13:24 +02:00 committed by GitHub
commit e119b7697f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 151 additions and 108 deletions

View file

@ -5,14 +5,21 @@
### Description
Initial code lets you do four operations:
RAG test manager can be used via API (in progress) or via the CLI
1. Add to memory
2. Retrieve from memory
3. Structure the data to schema
4. Load to a database
Make sure to run scripts/create_database.py
## How to use
After that, you can run:
``` python test_runner.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "path/to/test_set.json" \
--user_id "666" \
--metadata "path/to/metadata.json"
```
## How to start
## Installation
@ -22,6 +29,24 @@ Initial code lets you do three operations:
```docker compose up promethai_mem ```
``` poetry shell ```
Make sure to run
``` python scripts/create_database.py ```
After that, you can run:
``` python rag_test_manager.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "example_data/test_set.json" \
--user_id "666" \
--metadata "example_data/metadata.json"
```
To see examples of test_set.json and metadata.json, check the files in the "example_data" folder
## Clean database
@ -30,7 +55,7 @@ Initial code lets you do three operations:
```docker volume prune ```
docker compose up --force-recreate --build promethai_mem
``` docker compose up --force-recreate --build promethai_mem ```
## Usage

View file

@ -0,0 +1,13 @@
{
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31"
}

View file

@ -0,0 +1,22 @@
[
{
"question": "Who is the main character in 'The Call of the Wild'?",
"answer": "Buck"
},
{
"question": "Who wrote 'The Call of the Wild'?",
"answer": "Jack London"
},
{
"question": "Where does Buck live at the start of the book?",
"answer": "In the Santa Clara Valley, at Judge Millers place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]

View file

@ -1,3 +1,5 @@
import argparse
import json
from enum import Enum
import sys
import os
@ -383,49 +385,87 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
print(test_result_collection)
add_entity(session, TestOutput(id=test_id, user_id=user_id, content=str(test_result_collection)))
add_entity(session, TestOutput(id=test_id, user_id=user_id, test_results=str(test_result_collection)))
async def main():
#
# params = {
# "version": "1.0",
# "agreement_id": "AG123456",
# "privacy_policy": "https://example.com/privacy",
# "terms_of_service": "https://example.com/terms",
# "format": "json",
# "schema_version": "1.1",
# "checksum": "a1b2c3d4e5f6",
# "owner": "John Doe",
# "license": "MIT",
# "validity_start": "2023-08-01",
# "validity_end": "2024-07-31",
# }
#
# test_set = [
# {
# "question": "Who is the main character in 'The Call of the Wild'?",
# "answer": "Buck"
# },
# {
# "question": "Who wrote 'The Call of the Wild'?",
# "answer": "Jack London"
# },
# {
# "question": "Where does Buck live at the start of the book?",
# "answer": "In the Santa Clara Valley, at Judge Millers place."
# },
# {
# "question": "Why is Buck kidnapped?",
# "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
# },
# {
# "question": "How does Buck become the leader of the sled dog team?",
# "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
# }
# ]
# result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
#
parser = argparse.ArgumentParser(description="Run tests against a document.")
parser.add_argument("--url", required=True, help="URL of the document to test.")
parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
parser.add_argument("--user_id", required=True, help="User ID.")
parser.add_argument("--params", help="Additional parameters in JSON format.")
parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
params = {
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31",
}
args = parser.parse_args()
test_set = [
{
"question": "Who is the main character in 'The Call of the Wild'?",
"answer": "Buck"
},
{
"question": "Who wrote 'The Call of the Wild'?",
"answer": "Jack London"
},
{
"question": "Where does Buck live at the start of the book?",
"answer": "In the Santa Clara Valley, at Judge Millers place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]
result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
try:
with open(args.test_set, "r") as file:
test_set = json.load(file)
if not isinstance(test_set, list): # Expecting a list
raise TypeError("Parsed test_set JSON is not a list.")
except Exception as e:
print(f"Error loading test_set: {str(e)}")
return
try:
with open(args.metadata, "r") as file:
metadata = json.load(file)
if not isinstance(metadata, dict):
raise TypeError("Parsed metadata JSON is not a dictionary.")
except Exception as e:
print(f"Error loading metadata: {str(e)}")
return
if args.params:
try:
params = json.loads(args.params)
if not isinstance(params, dict):
raise TypeError("Parsed params JSON is not a dictionary.")
except json.JSONDecodeError as e:
print(f"Error parsing params: {str(e)}")
return
else:
params = None
await start_test(args.url, test_set, args.user_id, params, metadata)
if __name__ == "__main__":
import asyncio

View file

@ -1,7 +1,7 @@
from langchain.document_loaders import PyPDFLoader
import sys, os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.shared.chunk_strategy import ChunkStrategy
from shared.chunk_strategy import ChunkStrategy
import re
def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None):

View file

@ -1,8 +1,10 @@
from io import BytesIO
import fitz
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.vectordb.chunkers.chunkers import chunk_data
from vectordb.chunkers.chunkers import chunk_data
from llama_hub.file.base import SimpleDirectoryReader
import requests

View file

@ -248,9 +248,9 @@ class WeaviateVectorDB(VectorDB):
return client.batch.delete_objects(
class_name=namespace,
where={
"path": ["user_id"],
"path": ["version"],
"operator": "Equal",
"valueText": self.user_id,
"valueText": "1.0",
},
)

View file

@ -345,62 +345,3 @@ if __name__ == "__main__":
asyncio.run(main())
# Check for existing user
# existing_user = session.query(User).filter_by(id=user_id).first()
#
# if existing_user:
# self.memory_id = existing_user.memory_id
# existing_memories_classes = session.query(Memory).filter_by(id=user_id).first()
# self.memory_instances = []
#
# for memory in existing_memories_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# else:
# self.memory_id = str(uuid.uuid4())
# new_user = User(id=user_id, memory_id=self.memory_id) # Adjust as per your User model
# session.add(new_user)
# session.commit()
# memory_classes = ['Memory', 'SemanticMemory', 'EpisodicMemory']
# self.memory_instances = []
#
# for memory in memory_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# # fix this so it uploads relationships between memories
# session.add(Memory(id=self.memory_id, user_id=user_id))
# session.commit()
#
# if existing_user:
# attributes_list = session.query(Memory.attributes_list).filter_by(id=self.memory_id).scalar()
# attributes_list = ast.literal_eval(attributes_list)
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# methods_list = session.query(Memory.methods_list).filter_by(id=self.memory_id).scalar()
# methods_list = ast.literal_eval(methods_list)
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# else:
# attributes_list = ['user_id', 'index_name', 'db_type', 'knowledge_source', 'knowledge_type', 'memory_id',
# 'long_term_memory', 'short_term_memory', 'namespace']
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# # if old user, fetch attributes from memory table and load them like above
# # if new user, load methods from a list
# methods_list = ['async_create_long_term_memory', 'async_init', 'add_memories', "fetch_memories",
# 'async_create_short_term_memory',
# '_create_buffer_context', '_get_task_list', '_run_main_buffer',
# '_available_operations', '_provide_feedback']
# session.add(Memory(id=self.memory_id, user_id=user_id, methods_list=str(methods_list),
# attributes_list=str(attributes_list)))
# session.commit()
# # if old user, load methods from db
# # if new user, use class inheritance like below
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# # Safely convert string representation to a list