Merge pull request #18 from topoteretes/extend_search_and_tokenizers

Extend search and tokenizers
This commit is contained in:
Vasilije 2023-10-09 21:13:24 +02:00 committed by GitHub
commit e119b7697f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 151 additions and 108 deletions

View file

@ -5,14 +5,21 @@
### Description ### Description
Initial code lets you do three operations: RAG test manager can be used via API (in progress) or via the CLI
1. Add to memory Make sure to run scripts/create_database.py
2. Retrieve from memory
3. Structure the data to schema
4. Load to a database
#How to use After that, you can run:
``` python test_runner.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "path/to/test_set.json" \
--user_id "666" \
--metadata "path/to/metadata.json"
```
#How to start
## Installation ## Installation
@ -22,6 +29,24 @@ Initial code lets you do three operations:
```docker compose up promethai_mem ``` ```docker compose up promethai_mem ```
``` poetry shell ```
Make sure to run
``` python scripts/create_database.py ```
After that, you can run:
``` python rag_test_manager.py \
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
--test_set "example_data/test_set.json" \
--user_id "666" \
--metadata "example_data/metadata.json"
```
To see examples of test_set.json and metadata.json, check the files in the "example_data" folder
## Clean database ## Clean database
@ -30,7 +55,7 @@ Initial code lets you do three operations:
```docker volume prune ``` ```docker volume prune ```
docker compose up --force-recreate --build promethai_mem ``` docker compose up --force-recreate --build promethai_mem ```
## Usage ## Usage

View file

@ -0,0 +1,13 @@
{
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31"
}

View file

@ -0,0 +1,22 @@
[
{
"question": "Who is the main character in 'The Call of the Wild'?",
"answer": "Buck"
},
{
"question": "Who wrote 'The Call of the Wild'?",
"answer": "Jack London"
},
{
"question": "Where does Buck live at the start of the book?",
        "answer": "In the Santa Clara Valley, at Judge Miller's place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]

View file

@ -1,3 +1,5 @@
import argparse
import json
from enum import Enum from enum import Enum
import sys import sys
import os import os
@ -383,49 +385,87 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
print(test_result_collection) print(test_result_collection)
add_entity(session, TestOutput(id=test_id, user_id=user_id, content=str(test_result_collection))) add_entity(session, TestOutput(id=test_id, user_id=user_id, test_results=str(test_result_collection)))
async def main(): async def main():
#
# params = {
# "version": "1.0",
# "agreement_id": "AG123456",
# "privacy_policy": "https://example.com/privacy",
# "terms_of_service": "https://example.com/terms",
# "format": "json",
# "schema_version": "1.1",
# "checksum": "a1b2c3d4e5f6",
# "owner": "John Doe",
# "license": "MIT",
# "validity_start": "2023-08-01",
# "validity_end": "2024-07-31",
# }
#
# test_set = [
# {
# "question": "Who is the main character in 'The Call of the Wild'?",
# "answer": "Buck"
# },
# {
# "question": "Who wrote 'The Call of the Wild'?",
# "answer": "Jack London"
# },
# {
# "question": "Where does Buck live at the start of the book?",
# "answer": "In the Santa Clara Valley, at Judge Millers place."
# },
# {
# "question": "Why is Buck kidnapped?",
# "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
# },
# {
# "question": "How does Buck become the leader of the sled dog team?",
# "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
# }
# ]
# result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
#
parser = argparse.ArgumentParser(description="Run tests against a document.")
parser.add_argument("--url", required=True, help="URL of the document to test.")
parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
parser.add_argument("--user_id", required=True, help="User ID.")
parser.add_argument("--params", help="Additional parameters in JSON format.")
parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
params = { args = parser.parse_args()
"version": "1.0",
"agreement_id": "AG123456",
"privacy_policy": "https://example.com/privacy",
"terms_of_service": "https://example.com/terms",
"format": "json",
"schema_version": "1.1",
"checksum": "a1b2c3d4e5f6",
"owner": "John Doe",
"license": "MIT",
"validity_start": "2023-08-01",
"validity_end": "2024-07-31",
}
test_set = [ try:
{ with open(args.test_set, "r") as file:
"question": "Who is the main character in 'The Call of the Wild'?", test_set = json.load(file)
"answer": "Buck" if not isinstance(test_set, list): # Expecting a list
}, raise TypeError("Parsed test_set JSON is not a list.")
{ except Exception as e:
"question": "Who wrote 'The Call of the Wild'?", print(f"Error loading test_set: {str(e)}")
"answer": "Jack London" return
},
{
"question": "Where does Buck live at the start of the book?",
"answer": "In the Santa Clara Valley, at Judge Millers place."
},
{
"question": "Why is Buck kidnapped?",
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
},
{
"question": "How does Buck become the leader of the sled dog team?",
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
}
]
result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
try:
with open(args.metadata, "r") as file:
metadata = json.load(file)
if not isinstance(metadata, dict):
raise TypeError("Parsed metadata JSON is not a dictionary.")
except Exception as e:
print(f"Error loading metadata: {str(e)}")
return
if args.params:
try:
params = json.loads(args.params)
if not isinstance(params, dict):
raise TypeError("Parsed params JSON is not a dictionary.")
except json.JSONDecodeError as e:
print(f"Error parsing params: {str(e)}")
return
else:
params = None
await start_test(args.url, test_set, args.user_id, params, metadata)
if __name__ == "__main__": if __name__ == "__main__":
import asyncio import asyncio

View file

@ -1,7 +1,7 @@
from langchain.document_loaders import PyPDFLoader from langchain.document_loaders import PyPDFLoader
import sys, os import sys, os
sys.path.append(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.shared.chunk_strategy import ChunkStrategy from shared.chunk_strategy import ChunkStrategy
import re import re
def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None): def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None):

View file

@ -1,8 +1,10 @@
from io import BytesIO from io import BytesIO
import fitz import fitz
# sys.path.append(os.path.dirname(os.path.abspath(__file__))) import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from level_3.vectordb.chunkers.chunkers import chunk_data from vectordb.chunkers.chunkers import chunk_data
from llama_hub.file.base import SimpleDirectoryReader from llama_hub.file.base import SimpleDirectoryReader
import requests import requests

View file

@ -248,9 +248,9 @@ class WeaviateVectorDB(VectorDB):
return client.batch.delete_objects( return client.batch.delete_objects(
class_name=namespace, class_name=namespace,
where={ where={
"path": ["user_id"], "path": ["version"],
"operator": "Equal", "operator": "Equal",
"valueText": self.user_id, "valueText": "1.0",
}, },
) )

View file

@ -345,62 +345,3 @@ if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())
# Check for existing user
# existing_user = session.query(User).filter_by(id=user_id).first()
#
# if existing_user:
# self.memory_id = existing_user.memory_id
# existing_memories_classes = session.query(Memory).filter_by(id=user_id).first()
# self.memory_instances = []
#
# for memory in existing_memories_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# else:
# self.memory_id = str(uuid.uuid4())
# new_user = User(id=user_id, memory_id=self.memory_id) # Adjust as per your User model
# session.add(new_user)
# session.commit()
# memory_classes = ['Memory', 'SemanticMemory', 'EpisodicMemory']
# self.memory_instances = []
#
# for memory in memory_classes:
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
# self.memory_instances.append(instance)
# # fix this so it uploads relationships between memories
# session.add(Memory(id=self.memory_id, user_id=user_id))
# session.commit()
#
# if existing_user:
# attributes_list = session.query(Memory.attributes_list).filter_by(id=self.memory_id).scalar()
# attributes_list = ast.literal_eval(attributes_list)
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# methods_list = session.query(Memory.methods_list).filter_by(id=self.memory_id).scalar()
# methods_list = ast.literal_eval(methods_list)
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# else:
# attributes_list = ['user_id', 'index_name', 'db_type', 'knowledge_source', 'knowledge_type', 'memory_id',
# 'long_term_memory', 'short_term_memory', 'namespace']
# for attr in attributes_list:
# self.memory_class.add_attribute(attr)
# # if old user, fetch attributes from memory table and load them like above
# # if new user, load methods from a list
# methods_list = ['async_create_long_term_memory', 'async_init', 'add_memories', "fetch_memories",
# 'async_create_short_term_memory',
# '_create_buffer_context', '_get_task_list', '_run_main_buffer',
# '_available_operations', '_provide_feedback']
# session.add(Memory(id=self.memory_id, user_id=user_id, methods_list=str(methods_list),
# attributes_list=str(attributes_list)))
# session.commit()
# # if old user, load methods from db
# # if new user, use class inherintance like bellow
# for class_instance in self.memory_instances:
# # , self.episodic_buffer_class]:
# for method in methods_list:
# class_instance.add_method(method)
# # Safely convert string representation to a list