Update docs, fix issue with params, add tracking

Vasilije 2023-10-31 00:29:45 +01:00
parent 3409d5bf61
commit e7e5d9831e
11 changed files with 130 additions and 62 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@@ -13,7 +13,7 @@ Send the request to the API:
 ```
 curl -X POST -H "Content-Type: application/json" -d '{
   "payload": {
-    "user_id": "681",
+    "user_id": "97980cfea0067",
     "data": [".data/3ZCCCW.pdf"],
     "test_set": "sample",
    "params": ["chunk_size"],
@@ -81,7 +81,7 @@ After that, you can run the RAG test manager from your command line.
 python rag_test_manager.py \
   --file ".data" \
   --test_set "example_data/test_set.json" \
-  --user_id "666" \
+  --user_id "97980cfea0067" \
   --params "chunk_size" "search_type" \
   --metadata "example_data/metadata.json" \
   --retriever_type "single_document_context"
@@ -89,3 +89,21 @@ After that, you can run the RAG test manager from your command line.
 ```
 Examples of the metadata structure and the test set are in the "example_data" folder.
+
+```
+python rag_test_manager.py \
+  --file ".data" \
+  --test_set "example_data/test_set.json" \
+  --user_id "97980cfea0067" \
+  --params "chunk_size" "search_type" \
+  --metadata "example_data/metadata.json" \
+  --retriever_type "llm_context"
+
+python rag_test_manager.py \
+  --file ".data" \
+  --test_set "example_data/test_set.json" \
+  --user_id "97980cfea0068" \
+  --params "chunk_size" "search_type" "overlap" \
+  --metadata "example_data/metadata.json" \
+  --retriever_type "single_document_context"
+```

View file

@@ -51,23 +51,25 @@ services:
     ports:
       - "5432:5432"
-#  superset:
-#    platform: linux/amd64
-#    build:
-#      context: ./superset
-#      dockerfile: Dockerfile
-#    container_name: superset
-#    environment:
-#      - ADMIN_USERNAME=admin
-#      - ADMIN_EMAIL=vasilije@topoteretes.com
-#      - ADMIN_PASSWORD=admin
-#      - POSTGRES_USER=bla
-#      - POSTGRES_PASSWORD=bla
-#      - POSTGRES_DB=bubu
-#    networks:
-#      - promethai_mem_backend
-#    ports:
-#      - '8088:8088'
+  superset:
+    platform: linux/amd64
+    build:
+      context: ./superset
+      dockerfile: Dockerfile
+    container_name: superset
+    environment:
+      - ADMIN_USERNAME=admin
+      - ADMIN_EMAIL=vasilije@topoteretes.com
+      - ADMIN_PASSWORD=admin
+      - POSTGRES_USER=bla
+      - POSTGRES_PASSWORD=bla
+      - POSTGRES_DB=bubu
+    networks:
+      - promethai_mem_backend
+    ports:
+      - '8088:8088'
+    depends_on:
+      - postgres
 networks:
   promethai_mem_backend:

level_3/poetry.lock generated
View file

@@ -6240,4 +6240,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "1700f1cab54ed107d299a47031ad53c58e5e72e3791b9113652fb4ff7854a91a"
+content-hash = "e2d17132884b261841ab4ae6dbe1ae0d91a940915a1c99fc47a0d1ed3f920c05"

View file

@@ -41,6 +41,7 @@ unstructured = {extras = ["pdf"], version = "^0.10.23"}
 sentence-transformers = "2.2.2"
 torch = "2.0.*"
 segment-analytics-python = "^2.2.3"
+pdf2image = "^1.16.3"

View file

@@ -204,7 +204,7 @@ def generate_param_variants(
     # Default values
     defaults = {
-        "chunk_size": 250,
+        "chunk_size": 750,
         "chunk_overlap": 20,
         "similarity_score": 0.5,
         "metadata_variation": 0,
@@ -216,7 +216,7 @@
     params = {**defaults, **(base_params or {})}

     default_increments = {
-        "chunk_size": 150,
+        "chunk_size": 250,
         "chunk_overlap": 10,
         "similarity_score": 0.1,
         "metadata_variation": 1,
@@ -615,7 +615,10 @@ async def start_test(
         if loader_settings.get('search_type') == 'bm25':
             return retrieve_action["data"]["Get"][test_id]
         else:
-            return retrieve_action["data"]["Get"][test_id][0]["text"]
+            try:
+                return retrieve_action["data"]["Get"][test_id][0]["text"]
+            except (KeyError, IndexError, TypeError):  # fall back to the raw hit list
+                return retrieve_action["data"]["Get"][test_id]

     async def run_eval(test_item, search_result):
         logging.info("Initiated test set evaluation")
@@ -686,7 +689,7 @@
                 metadata=metadata,
                 retriever_type=retriever_type,
             )  # No params for this case
-            results.append([result, "No params"])
+            results.append(result)

         elif retriever_type == "single_document_context":
             logging.info("Retriever type: single document context")
@@ -697,43 +700,80 @@
                 )  # Add the params to the result
                 # result.append(param)
                 results.append(result)

         for b in results:
             logging.info("Loading %s", str(b))
-            for result, chunk in b:
-                logging.info("Loading %s", str(result))
-                await add_entity(
-                    session,
-                    TestOutput(
-                        id=test_id,
-                        test_set_id=test_set_id,
-                        operation_id=job_id,
-                        set_id=str(uuid.uuid4()),
-                        user_id=user_id,
-                        test_results=result["success"],
-                        test_score=str(result["score"]),
-                        test_metric_name=result["metric_name"],
-                        test_query=result["query"],
-                        test_output=result["output"],
-                        test_expected_output=str(["expected_output"]),
-                        test_context=result["context"][0],
-                        test_params=str(chunk),  # Add params to the database table
-                    ),
-                )
-                analytics.track(user_id, 'TestOutput', {
-                    'test_set_id': test_set_id,
-                    'operation_id': job_id,
-                    'set_id': str(uuid.uuid4()),
-                    'test_results': result["success"],
-                    'test_score': str(result["score"]),
-                    'test_metric_name': result["metric_name"],
-                    'test_query': result["query"],
-                    'test_output': result["output"],
-                    'test_expected_output': str(["expected_output"]),
-                    'test_context': result["context"][0],
-                    'test_params': str(chunk),
-                })
-                analytics.flush()
+            if retriever_type == "single_document_context":
+                for result, chunk in b:
+                    logging.info("Loading %s", str(result))
+                    await add_entity(
+                        session,
+                        TestOutput(
+                            id=test_id,
+                            test_set_id=test_set_id,
+                            operation_id=job_id,
+                            set_id=str(uuid.uuid4()),
+                            user_id=user_id,
+                            test_results=result["success"],
+                            test_score=str(result["score"]),
+                            test_metric_name=result["metric_name"],
+                            test_query=result["query"],
+                            test_output=result["output"],
+                            test_expected_output=str(result["expected_output"]),
+                            test_context=result["context"][0],
+                            test_params=str(chunk),  # Add params to the database table
+                        ),
+                    )
+                    analytics.track(user_id, 'TestOutput', {
+                        'test_set_id': test_set_id,
+                        'operation_id': job_id,
+                        'set_id': str(uuid.uuid4()),
+                        'test_results': result["success"],
+                        'test_score': str(result["score"]),
+                        'test_metric_name': result["metric_name"],
+                        'test_query': result["query"],
+                        'test_output': result["output"],
+                        'test_expected_output': str(result["expected_output"]),
+                        'test_context': result["context"][0],
+                        'test_params': str(chunk),
+                    })
+                    analytics.flush()
+            else:
+                chunk = "None"
+                for result in b:
+                    logging.info("Loading %s", str(result))
+                    await add_entity(
+                        session,
+                        TestOutput(
+                            id=test_id,
+                            test_set_id=test_set_id,
+                            operation_id=job_id,
+                            set_id=str(uuid.uuid4()),
+                            user_id=user_id,
+                            test_results=result[0]["success"],
+                            test_score=str(result[0]["score"]),
+                            test_metric_name=result[0]["metric_name"],
+                            test_query=result[0]["query"],
+                            test_output=result[0]["output"],
+                            test_expected_output=str(result[0]["expected_output"]),
+                            test_context=result[0]["context"][0],
+                            test_params=str(chunk),  # Add params to the database table
+                        ),
+                    )
+                    analytics.track(user_id, 'TestOutput', {
+                        'test_set_id': test_set_id,
+                        'operation_id': job_id,
+                        'set_id': str(uuid.uuid4()),
+                        'test_results': result[0]["success"],
+                        'test_score': str(result[0]["score"]),
+                        'test_metric_name': result[0]["metric_name"],
+                        'test_query': result[0]["query"],
+                        'test_output': result[0]["output"],
+                        'test_expected_output': str(result[0]["expected_output"]),
+                        'test_context': result[0]["context"][0],
+                        'test_params': str(chunk),
+                    })
+                    analytics.flush()

         await update_entity(session, Operation, job_id, "COMPLETED")
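
The new tracking calls rely on segment-analytics-python, already listed in pyproject.toml. A minimal sketch of the setup they assume — the write key and property values below are placeholders:

```
# Sketch of Segment setup; the write key is a placeholder.
import segment.analytics as analytics

analytics.write_key = "YOUR_SEGMENT_WRITE_KEY"

analytics.track("97980cfea0067", "TestOutput", {
    "test_set_id": "example-set",
    "test_score": "0.87",
})
analytics.flush()  # force-send queued events before the process exits
```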

View file

@@ -29,7 +29,10 @@ def vanilla_chunker(source_data, chunk_size=100, chunk_overlap=20):
         chunk_overlap=chunk_overlap,
         length_function=len
     )
-    pages = text_splitter.create_documents([source_data])
+    try:
+        pages = text_splitter.create_documents([source_data])
+    except (TypeError, AttributeError):  # source_data is a document object, not a raw string
+        pages = text_splitter.create_documents(source_data.content)
     # pages = source_data.load_and_split()
     return pages

 def chunk_data_exact(data_chunks, chunk_size, chunk_overlap):
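
A usage sketch for the chunker above; the import path is an assumption, since only the function body appears in the diff:

```
# Adjust the import to wherever vanilla_chunker lives in this repo.
from chunkers import vanilla_chunker

text = open("example.txt", encoding="utf-8").read()
pages = vanilla_chunker(text, chunk_size=100, chunk_overlap=20)
for page in pages:
    # create_documents returns LangChain Documents with a page_content field.
    print(len(page.page_content), page.page_content[:60])
```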

View file

@@ -56,16 +56,20 @@ async def _document_loader( observation: str, loader_settings: dict):
     if document_format == "PDF":
         # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
         # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
     elif document_format == "TEXT":
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
         # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
     else:
         raise ValueError(f"Unsupported document format: {document_format}")
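
One caveat: str(documents) serializes the loader's whole Document objects, metadata included, into the text that gets chunked. If only the text is wanted, a sketch of the alternative (assuming LangChain-style Documents with a page_content attribute):

```
# Join only the document text, skipping metadata, before chunking.
source_text = "\n".join(doc.page_content for doc in documents)
pages = chunk_data(chunk_strategy=loader_strategy, source_data=source_text,
                   chunk_size=chunk_size, chunk_overlap=chunk_overlap)
```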