Merge pull request #18 from topoteretes/extend_search_and_tokenizers
Extend search and tokenizers
This commit is contained in:
commit
e119b7697f
8 changed files with 151 additions and 108 deletions
|
|
@ -5,14 +5,21 @@
|
||||||
### Description
|
### Description
|
||||||
|
|
||||||
|
|
||||||
Initial code lets you do three operations:
|
The RAG test manager can be used via the API (in progress) or via the CLI.
|
||||||
|
|
||||||
1. Add to memory
|
Make sure to run scripts/create_database.py
|
||||||
2. Retrieve from memory
|
|
||||||
3. Structure the data to schema
|
|
||||||
4. Load to a database
|
|
||||||
|
|
||||||
# How to use
|
After that, you can run:
|
||||||
|
|
||||||
|
``` python test_runner.py \
|
||||||
|
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
|
||||||
|
--test_set "path/to/test_set.json" \
|
||||||
|
--user_id "666" \
|
||||||
|
--metadata "path/to/metadata.json"
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
# How to start
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
|
|
@ -22,6 +29,24 @@ Initial code lets you do three operations:
|
||||||
|
|
||||||
```docker compose up promethai_mem ```
|
```docker compose up promethai_mem ```
|
||||||
|
|
||||||
|
``` poetry shell ```
|
||||||
|
|
||||||
|
Make sure to run
|
||||||
|
|
||||||
|
``` python scripts/create_database.py ```
|
||||||
|
|
||||||
|
After that, you can run:
|
||||||
|
|
||||||
|
``` python rag_test_manager.py \
|
||||||
|
--url "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf" \
|
||||||
|
--test_set "example_data/test_set.json" \
|
||||||
|
--user_id "666" \
|
||||||
|
--metadata "example_data/metadata.json"
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
To see examples of test_set.json and metadata.json, check the files in the "example_data" folder.
|
||||||
|
|
||||||
|
|
||||||
## Clean database
|
## Clean database
|
||||||
|
|
||||||
|
|
@ -30,7 +55,7 @@ Initial code lets you do three operations:
|
||||||
|
|
||||||
```docker volume prune ```
|
```docker volume prune ```
|
||||||
|
|
||||||
docker compose up --force-recreate --build promethai_mem
|
``` docker compose up --force-recreate --build promethai_mem ```
|
||||||
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
|
||||||
13
level_3/example_data/metadata.json
Normal file
13
level_3/example_data/metadata.json
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
{
|
||||||
|
"version": "1.0",
|
||||||
|
"agreement_id": "AG123456",
|
||||||
|
"privacy_policy": "https://example.com/privacy",
|
||||||
|
"terms_of_service": "https://example.com/terms",
|
||||||
|
"format": "json",
|
||||||
|
"schema_version": "1.1",
|
||||||
|
"checksum": "a1b2c3d4e5f6",
|
||||||
|
"owner": "John Doe",
|
||||||
|
"license": "MIT",
|
||||||
|
"validity_start": "2023-08-01",
|
||||||
|
"validity_end": "2024-07-31"
|
||||||
|
}
|
||||||
22
level_3/example_data/test_set.json
Normal file
22
level_3/example_data/test_set.json
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"question": "Who is the main character in 'The Call of the Wild'?",
|
||||||
|
"answer": "Buck"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "Who wrote 'The Call of the Wild'?",
|
||||||
|
"answer": "Jack London"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "Where does Buck live at the start of the book?",
|
||||||
|
"answer": "In the Santa Clara Valley, at Judge Miller’s place."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "Why is Buck kidnapped?",
|
||||||
|
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "How does Buck become the leader of the sled dog team?",
|
||||||
|
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
@ -383,49 +385,87 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
|
||||||
|
|
||||||
print(test_result_collection)
|
print(test_result_collection)
|
||||||
|
|
||||||
add_entity(session, TestOutput(id=test_id, user_id=user_id, content=str(test_result_collection)))
|
add_entity(session, TestOutput(id=test_id, user_id=user_id, test_results=str(test_result_collection)))
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
#
|
||||||
|
# params = {
|
||||||
|
# "version": "1.0",
|
||||||
|
# "agreement_id": "AG123456",
|
||||||
|
# "privacy_policy": "https://example.com/privacy",
|
||||||
|
# "terms_of_service": "https://example.com/terms",
|
||||||
|
# "format": "json",
|
||||||
|
# "schema_version": "1.1",
|
||||||
|
# "checksum": "a1b2c3d4e5f6",
|
||||||
|
# "owner": "John Doe",
|
||||||
|
# "license": "MIT",
|
||||||
|
# "validity_start": "2023-08-01",
|
||||||
|
# "validity_end": "2024-07-31",
|
||||||
|
# }
|
||||||
|
#
|
||||||
|
# test_set = [
|
||||||
|
# {
|
||||||
|
# "question": "Who is the main character in 'The Call of the Wild'?",
|
||||||
|
# "answer": "Buck"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "question": "Who wrote 'The Call of the Wild'?",
|
||||||
|
# "answer": "Jack London"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "question": "Where does Buck live at the start of the book?",
|
||||||
|
# "answer": "In the Santa Clara Valley, at Judge Miller’s place."
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "question": "Why is Buck kidnapped?",
|
||||||
|
# "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# "question": "How does Buck become the leader of the sled dog team?",
|
||||||
|
# "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
|
||||||
|
#
|
||||||
|
parser = argparse.ArgumentParser(description="Run tests against a document.")
|
||||||
|
parser.add_argument("--url", required=True, help="URL of the document to test.")
|
||||||
|
parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
|
||||||
|
parser.add_argument("--user_id", required=True, help="User ID.")
|
||||||
|
parser.add_argument("--params", help="Additional parameters in JSON format.")
|
||||||
|
parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
|
||||||
|
|
||||||
params = {
|
args = parser.parse_args()
|
||||||
"version": "1.0",
|
|
||||||
"agreement_id": "AG123456",
|
|
||||||
"privacy_policy": "https://example.com/privacy",
|
|
||||||
"terms_of_service": "https://example.com/terms",
|
|
||||||
"format": "json",
|
|
||||||
"schema_version": "1.1",
|
|
||||||
"checksum": "a1b2c3d4e5f6",
|
|
||||||
"owner": "John Doe",
|
|
||||||
"license": "MIT",
|
|
||||||
"validity_start": "2023-08-01",
|
|
||||||
"validity_end": "2024-07-31",
|
|
||||||
}
|
|
||||||
|
|
||||||
test_set = [
|
try:
|
||||||
{
|
with open(args.test_set, "r") as file:
|
||||||
"question": "Who is the main character in 'The Call of the Wild'?",
|
test_set = json.load(file)
|
||||||
"answer": "Buck"
|
if not isinstance(test_set, list): # Expecting a list
|
||||||
},
|
raise TypeError("Parsed test_set JSON is not a list.")
|
||||||
{
|
except Exception as e:
|
||||||
"question": "Who wrote 'The Call of the Wild'?",
|
print(f"Error loading test_set: {str(e)}")
|
||||||
"answer": "Jack London"
|
return
|
||||||
},
|
|
||||||
{
|
|
||||||
"question": "Where does Buck live at the start of the book?",
|
|
||||||
"answer": "In the Santa Clara Valley, at Judge Miller’s place."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"question": "Why is Buck kidnapped?",
|
|
||||||
"answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"question": "How does Buck become the leader of the sled dog team?",
|
|
||||||
"answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
result = await start_test("https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf", test_set=test_set, user_id="666", params=None, metadata=params)
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(args.metadata, "r") as file:
|
||||||
|
metadata = json.load(file)
|
||||||
|
if not isinstance(metadata, dict):
|
||||||
|
raise TypeError("Parsed metadata JSON is not a dictionary.")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error loading metadata: {str(e)}")
|
||||||
|
return
|
||||||
|
|
||||||
|
if args.params:
|
||||||
|
try:
|
||||||
|
params = json.loads(args.params)
|
||||||
|
if not isinstance(params, dict):
|
||||||
|
raise TypeError("Parsed params JSON is not a dictionary.")
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f"Error parsing params: {str(e)}")
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
params = None
|
||||||
|
|
||||||
|
await start_test(args.url, test_set, args.user_id, params, metadata)
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
from langchain.document_loaders import PyPDFLoader
|
from langchain.document_loaders import PyPDFLoader
|
||||||
import sys, os
|
import sys, os
|
||||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||||
from level_3.shared.chunk_strategy import ChunkStrategy
|
from shared.chunk_strategy import ChunkStrategy
|
||||||
import re
|
import re
|
||||||
def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None):
|
def chunk_data(chunk_strategy=None, source_data=None, chunk_size=None, chunk_overlap=None):
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,10 @@
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import fitz
|
import fitz
|
||||||
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
from level_3.vectordb.chunkers.chunkers import chunk_data
|
from vectordb.chunkers.chunkers import chunk_data
|
||||||
from llama_hub.file.base import SimpleDirectoryReader
|
from llama_hub.file.base import SimpleDirectoryReader
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
|
||||||
|
|
@ -248,9 +248,9 @@ class WeaviateVectorDB(VectorDB):
|
||||||
return client.batch.delete_objects(
|
return client.batch.delete_objects(
|
||||||
class_name=namespace,
|
class_name=namespace,
|
||||||
where={
|
where={
|
||||||
"path": ["user_id"],
|
"path": ["version"],
|
||||||
"operator": "Equal",
|
"operator": "Equal",
|
||||||
"valueText": self.user_id,
|
"valueText": "1.0",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -345,62 +345,3 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|
||||||
# Check for existing user
|
|
||||||
# existing_user = session.query(User).filter_by(id=user_id).first()
|
|
||||||
#
|
|
||||||
# if existing_user:
|
|
||||||
# self.memory_id = existing_user.memory_id
|
|
||||||
# existing_memories_classes = session.query(Memory).filter_by(id=user_id).first()
|
|
||||||
# self.memory_instances = []
|
|
||||||
#
|
|
||||||
# for memory in existing_memories_classes:
|
|
||||||
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
|
|
||||||
# self.memory_instances.append(instance)
|
|
||||||
# else:
|
|
||||||
# self.memory_id = str(uuid.uuid4())
|
|
||||||
# new_user = User(id=user_id, memory_id=self.memory_id) # Adjust as per your User model
|
|
||||||
# session.add(new_user)
|
|
||||||
# session.commit()
|
|
||||||
# memory_classes = ['Memory', 'SemanticMemory', 'EpisodicMemory']
|
|
||||||
# self.memory_instances = []
|
|
||||||
#
|
|
||||||
# for memory in memory_classes:
|
|
||||||
# instance = DynamicBaseMemory(memory, user_id, self.memory_id, index_name, db_type, namespace)
|
|
||||||
# self.memory_instances.append(instance)
|
|
||||||
# # fix this so it uploads relationships between memories
|
|
||||||
# session.add(Memory(id=self.memory_id, user_id=user_id))
|
|
||||||
# session.commit()
|
|
||||||
#
|
|
||||||
# if existing_user:
|
|
||||||
# attributes_list = session.query(Memory.attributes_list).filter_by(id=self.memory_id).scalar()
|
|
||||||
# attributes_list = ast.literal_eval(attributes_list)
|
|
||||||
# for attr in attributes_list:
|
|
||||||
# self.memory_class.add_attribute(attr)
|
|
||||||
# methods_list = session.query(Memory.methods_list).filter_by(id=self.memory_id).scalar()
|
|
||||||
# methods_list = ast.literal_eval(methods_list)
|
|
||||||
# for class_instance in self.memory_instances:
|
|
||||||
# # , self.episodic_buffer_class]:
|
|
||||||
# for method in methods_list:
|
|
||||||
# class_instance.add_method(method)
|
|
||||||
# else:
|
|
||||||
# attributes_list = ['user_id', 'index_name', 'db_type', 'knowledge_source', 'knowledge_type', 'memory_id',
|
|
||||||
# 'long_term_memory', 'short_term_memory', 'namespace']
|
|
||||||
# for attr in attributes_list:
|
|
||||||
# self.memory_class.add_attribute(attr)
|
|
||||||
# # if old user, fetch attributes from memory table and load them like above
|
|
||||||
# # if new user, load methods from a list
|
|
||||||
# methods_list = ['async_create_long_term_memory', 'async_init', 'add_memories', "fetch_memories",
|
|
||||||
# 'async_create_short_term_memory',
|
|
||||||
# '_create_buffer_context', '_get_task_list', '_run_main_buffer',
|
|
||||||
# '_available_operations', '_provide_feedback']
|
|
||||||
# session.add(Memory(id=self.memory_id, user_id=user_id, methods_list=str(methods_list),
|
|
||||||
# attributes_list=str(attributes_list)))
|
|
||||||
# session.commit()
|
|
||||||
# # if old user, load methods from db
|
|
||||||
# # if new user, use class inheritance like below
|
|
||||||
# for class_instance in self.memory_instances:
|
|
||||||
# # , self.episodic_buffer_class]:
|
|
||||||
# for method in methods_list:
|
|
||||||
# class_instance.add_method(method)
|
|
||||||
|
|
||||||
# # Safely convert string representation to a list
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue