Update docs, fix issue with params, add tracking

This commit is contained in:
Vasilije 2023-10-31 00:29:45 +01:00
parent 3409d5bf61
commit e7e5d9831e
11 changed files with 130 additions and 62 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -13,7 +13,7 @@ Send the request to the API:
``` ```
curl -X POST -H "Content-Type: application/json" -d '{ curl -X POST -H "Content-Type: application/json" -d '{
"payload": { "payload": {
"user_id": "681", "user_id": "97980cfea0067",
"data": [".data/3ZCCCW.pdf"], "data": [".data/3ZCCCW.pdf"],
"test_set": "sample", "test_set": "sample",
"params": ["chunk_size"], "params": ["chunk_size"],
@ -81,7 +81,7 @@ After that, you can run the RAG test manager from your command line.
python rag_test_manager.py \ python rag_test_manager.py \
--file ".data" \ --file ".data" \
--test_set "example_data/test_set.json" \ --test_set "example_data/test_set.json" \
--user_id "666" \ --user_id "97980cfea0067" \
--params "chunk_size" "search_type" \ --params "chunk_size" "search_type" \
--metadata "example_data/metadata.json" \ --metadata "example_data/metadata.json" \
--retriever_type "single_document_context" --retriever_type "single_document_context"
@ -89,3 +89,21 @@ After that, you can run the RAG test manager from your command line.
``` ```
Examples of metadata structure and test set are in the folder "example_data" Examples of metadata structure and test set are in the folder "example_data"
python rag_test_manager.py \
--file ".data" \
--test_set "example_data/test_set.json" \
--user_id "97980cfea0067" \
--params "chunk_size" "search_type" \
--metadata "example_data/metadata.json" \
--retriever_type "llm_context"
python rag_test_manager.py \
--file ".data" \
--test_set "example_data/test_set.json" \
--user_id "97980cfea0068" \
--params "chunk_size" "search_type" "overlap" \
--metadata "example_data/metadata.json" \
--retriever_type "single_document_context"

View file

@ -51,23 +51,25 @@ services:
ports: ports:
- "5432:5432" - "5432:5432"
# superset: superset:
# platform: linux/amd64 platform: linux/amd64
# build: build:
# context: ./superset context: ./superset
# dockerfile: Dockerfile dockerfile: Dockerfile
# container_name: superset container_name: superset
# environment: environment:
# - ADMIN_USERNAME=admin - ADMIN_USERNAME=admin
# - ADMIN_EMAIL=vasilije@topoteretes.com - ADMIN_EMAIL=vasilije@topoteretes.com
# - ADMIN_PASSWORD=admin - ADMIN_PASSWORD=admin
# - POSTGRES_USER=bla - POSTGRES_USER=bla
# - POSTGRES_PASSWORD=bla - POSTGRES_PASSWORD=bla
# - POSTGRES_DB=bubu - POSTGRES_DB=bubu
# networks: networks:
# - promethai_mem_backend - promethai_mem_backend
# ports: ports:
# - '8088:8088' - '8088:8088'
depends_on:
- postgres
networks: networks:
promethai_mem_backend: promethai_mem_backend:

2
level_3/poetry.lock generated
View file

@ -6240,4 +6240,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "1700f1cab54ed107d299a47031ad53c58e5e72e3791b9113652fb4ff7854a91a" content-hash = "e2d17132884b261841ab4ae6dbe1ae0d91a940915a1c99fc47a0d1ed3f920c05"

View file

@ -41,6 +41,7 @@ unstructured = {extras = ["pdf"], version = "^0.10.23"}
sentence-transformers = "2.2.2" sentence-transformers = "2.2.2"
torch = "2.0.*" torch = "2.0.*"
segment-analytics-python = "^2.2.3" segment-analytics-python = "^2.2.3"
pdf2image = "^1.16.3"

View file

@ -204,7 +204,7 @@ def generate_param_variants(
# Default values # Default values
defaults = { defaults = {
"chunk_size": 250, "chunk_size": 750,
"chunk_overlap": 20, "chunk_overlap": 20,
"similarity_score": 0.5, "similarity_score": 0.5,
"metadata_variation": 0, "metadata_variation": 0,
@ -216,7 +216,7 @@ def generate_param_variants(
params = {**defaults, **(base_params or {})} params = {**defaults, **(base_params or {})}
default_increments = { default_increments = {
"chunk_size": 150, "chunk_size": 250,
"chunk_overlap": 10, "chunk_overlap": 10,
"similarity_score": 0.1, "similarity_score": 0.1,
"metadata_variation": 1, "metadata_variation": 1,
@ -615,7 +615,10 @@ async def start_test(
if loader_settings.get('search_type') == 'bm25': if loader_settings.get('search_type') == 'bm25':
return retrieve_action["data"]["Get"][test_id] return retrieve_action["data"]["Get"][test_id]
else: else:
return retrieve_action["data"]["Get"][test_id][0]["text"] try:
return retrieve_action["data"]["Get"][test_id][0]["text"]
except:
return retrieve_action["data"]["Get"][test_id]
async def run_eval(test_item, search_result): async def run_eval(test_item, search_result):
logging.info("Initiated test set evaluation") logging.info("Initiated test set evaluation")
@ -686,7 +689,7 @@ async def start_test(
metadata=metadata, metadata=metadata,
retriever_type=retriever_type, retriever_type=retriever_type,
) # No params for this case ) # No params for this case
results.append([result, "No params"]) results.append(result)
elif retriever_type == "single_document_context": elif retriever_type == "single_document_context":
logging.info("Retriever type: single document context") logging.info("Retriever type: single document context")
@ -697,43 +700,80 @@ async def start_test(
) # Add the params to the result ) # Add the params to the result
# result.append(param) # result.append(param)
results.append(result) results.append(result)
for b in results: for b in results:
logging.info("Loading %s", str(b)) logging.info("Loading %s", str(b))
for result, chunk in b: if retriever_type == "single_document_context":
logging.info("Loading %s", str(result)) for result, chunk in b:
await add_entity( logging.info("Loading %s", str(result))
session, await add_entity(
TestOutput( session,
id=test_id, TestOutput(
test_set_id=test_set_id, id=test_id,
operation_id=job_id, test_set_id=test_set_id,
set_id=str(uuid.uuid4()), operation_id=job_id,
user_id=user_id, set_id=str(uuid.uuid4()),
test_results=result["success"], user_id=user_id,
test_score=str(result["score"]), test_results=result["success"],
test_metric_name=result["metric_name"], test_score=str(result["score"]),
test_query=result["query"], test_metric_name=result["metric_name"],
test_output=result["output"], test_query=result["query"],
test_expected_output=str(["expected_output"]), test_output=result["output"],
test_context=result["context"][0], test_expected_output=str(["expected_output"]),
test_params=str(chunk), # Add params to the database table test_context=result["context"][0],
), test_params=str(chunk), # Add params to the database table
) ),
analytics.track(user_id, 'TestOutput', { )
'test_set_id': test_set_id, analytics.track(user_id, 'TestOutput', {
'operation_id': job_id, 'test_set_id': test_set_id,
'set_id' : str(uuid.uuid4()), 'operation_id': job_id,
'test_results' : result["success"], 'set_id' : str(uuid.uuid4()),
'test_score' : str(result["score"]), 'test_results' : result["success"],
'test_metric_name' : result["metric_name"], 'test_score' : str(result["score"]),
'test_query' : result["query"], 'test_metric_name' : result["metric_name"],
'test_output' : result["output"], 'test_query' : result["query"],
'test_expected_output' : str(["expected_output"]), 'test_output' : result["output"],
'test_context' : result["context"][0], 'test_expected_output' : str(["expected_output"]),
'test_params' : str(chunk), 'test_context' : result["context"][0],
}) 'test_params' : str(chunk),
analytics.flush() })
analytics.flush()
else:
chunk="None"
for result in b:
logging.info("Loading %s", str(result))
await add_entity(
session,
TestOutput(
id=test_id,
test_set_id=test_set_id,
operation_id=job_id,
set_id=str(uuid.uuid4()),
user_id=user_id,
test_results=result[0]["success"],
test_score=str(result[0]["score"]),
test_metric_name=result[0]["metric_name"],
test_query=result[0]["query"],
test_output=result[0]["output"],
test_expected_output=str(["expected_output"]),
test_context=result[0]["context"][0],
test_params=str(chunk), # Add params to the database table
),
)
analytics.track(user_id, 'TestOutput', {
'test_set_id': test_set_id,
'operation_id': job_id,
'set_id' : str(uuid.uuid4()),
'test_results' : result[0]["success"],
'test_score' : str(result[0]["score"]),
'test_metric_name' : result[0]["metric_name"],
'test_query' : result[0]["query"],
'test_output' : result[0]["output"],
'test_expected_output' : str(["expected_output"]),
'test_context' : result[0]["context"][0],
'test_params' : str(chunk),
})
analytics.flush()
await update_entity(session, Operation, job_id, "COMPLETED") await update_entity(session, Operation, job_id, "COMPLETED")

View file

@ -29,7 +29,10 @@ def vanilla_chunker(source_data, chunk_size=100, chunk_overlap=20):
chunk_overlap=chunk_overlap, chunk_overlap=chunk_overlap,
length_function=len length_function=len
) )
pages = text_splitter.create_documents([source_data]) try:
pages = text_splitter.create_documents([source_data])
except:
pages = text_splitter.create_documents(source_data.content)
# pages = source_data.load_and_split() # pages = source_data.load_and_split()
return pages return pages
def chunk_data_exact(data_chunks, chunk_size, chunk_overlap): def chunk_data_exact(data_chunks, chunk_size, chunk_overlap):

View file

@ -56,16 +56,20 @@ async def _document_loader( observation: str, loader_settings: dict):
if document_format == "PDF": if document_format == "PDF":
# loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True) # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
documents = loader.load() documents = loader.load()
pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
chunk_overlap=chunk_overlap)
logging.info("Documents: %s", documents) logging.info("Documents: %s", documents)
# pages = documents.load_and_split() # pages = documents.load_and_split()
chunked_doc.append(documents) chunked_doc.append(pages)
elif document_format == "TEXT": elif document_format == "TEXT":
documents = loader.load() documents = loader.load()
pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
chunk_overlap=chunk_overlap)
logging.info("Documents: %s", documents) logging.info("Documents: %s", documents)
# pages = documents.load_and_split() # pages = documents.load_and_split()
chunked_doc.append(documents) chunked_doc.append(pages)
else: else:
raise ValueError(f"Error: ") raise ValueError(f"Error: ")