Update docs, fix issue with params, add tracking

parent 3409d5bf61
commit e7e5d9831e

11 changed files with 130 additions and 62 deletions
BIN level_3/.data/BartlebyTheScrivener.pdf (new file, binary not shown)
BIN level_3/.data/CallofWild.pdf (new file, binary not shown)
@@ -13,7 +13,7 @@ Send the request to the API:
 ```
 curl -X POST -H "Content-Type: application/json" -d '{
   "payload": {
-    "user_id": "681",
+    "user_id": "97980cfea0067",
     "data": [".data/3ZCCCW.pdf"],
     "test_set": "sample",
     "params": ["chunk_size"],
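For reference, the documented request can also be sent from Python. A minimal sketch; the endpoint URL below is a placeholder, since this hunk does not show the real route:

```python
# Minimal sketch of the documented request. The URL is a placeholder;
# the hunk above does not show the actual endpoint.
import requests

payload = {
    "payload": {
        "user_id": "97980cfea0067",
        "data": [".data/3ZCCCW.pdf"],
        "test_set": "sample",
        "params": ["chunk_size"],
    }
}

response = requests.post("http://localhost:8000/rag-test", json=payload, timeout=30)
print(response.status_code, response.text)
```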
@@ -81,7 +81,7 @@ After that, you can run the RAG test manager from your command line.
 python rag_test_manager.py \
   --file ".data" \
   --test_set "example_data/test_set.json" \
-  --user_id "666" \
+  --user_id "97980cfea0067" \
   --params "chunk_size" "search_type" \
   --metadata "example_data/metadata.json" \
   --retriever_type "single_document_context"
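Since `--params` accepts several space-separated values, the CLI presumably declares it with `nargs`. A hedged sketch of how the parsing could look; the actual parser in `rag_test_manager.py` may differ:

```python
# Hypothetical sketch of the CLI surface shown above; the real parser
# in rag_test_manager.py may declare these flags differently.
import argparse

parser = argparse.ArgumentParser(description="RAG test manager")
parser.add_argument("--file", required=True, help="path to the input data folder")
parser.add_argument("--test_set", required=True, help="path to the test set JSON")
parser.add_argument("--user_id", required=True, help="user identifier")
parser.add_argument("--params", nargs="+", default=[],
                    help="parameters to vary, e.g. chunk_size search_type")
parser.add_argument("--metadata", required=True, help="path to the metadata JSON")
parser.add_argument("--retriever_type", required=True,
                    help="e.g. single_document_context or llm_context")

args = parser.parse_args()
print(args.params)  # e.g. ['chunk_size', 'search_type']
```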
@@ -89,3 +89,21 @@ After that, you can run the RAG test manager from your command line.
 ```
 
 Examples of metadata structure and test set are in the folder "example_data"
+
+
+python rag_test_manager.py \
+  --file ".data" \
+  --test_set "example_data/test_set.json" \
+  --user_id "97980cfea0067" \
+  --params "chunk_size" "search_type" \
+  --metadata "example_data/metadata.json" \
+  --retriever_type "llm_context"
+
+python rag_test_manager.py \
+  --file ".data" \
+  --test_set "example_data/test_set.json" \
+  --user_id "97980cfea0068" \
+  --params "chunk_size" "search_type" "overlap" \
+  --metadata "example_data/metadata.json" \
+  --retriever_type "single_document_context"
+
@@ -51,23 +51,25 @@ services:
     ports:
       - "5432:5432"
 
-#  superset:
-#    platform: linux/amd64
-#    build:
-#      context: ./superset
-#      dockerfile: Dockerfile
-#    container_name: superset
-#    environment:
-#      - ADMIN_USERNAME=admin
-#      - ADMIN_EMAIL=vasilije@topoteretes.com
-#      - ADMIN_PASSWORD=admin
-#      - POSTGRES_USER=bla
-#      - POSTGRES_PASSWORD=bla
-#      - POSTGRES_DB=bubu
-#    networks:
-#      - promethai_mem_backend
-#    ports:
-#      - '8088:8088'
+  superset:
+    platform: linux/amd64
+    build:
+      context: ./superset
+      dockerfile: Dockerfile
+    container_name: superset
+    environment:
+      - ADMIN_USERNAME=admin
+      - ADMIN_EMAIL=vasilije@topoteretes.com
+      - ADMIN_PASSWORD=admin
+      - POSTGRES_USER=bla
+      - POSTGRES_PASSWORD=bla
+      - POSTGRES_DB=bubu
+    networks:
+      - promethai_mem_backend
+    ports:
+      - '8088:8088'
+    depends_on:
+      - postgres
 
 networks:
   promethai_mem_backend:
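With the superset service uncommented, a quick smoke test can confirm it came up on the mapped port. A minimal sketch, assuming the container is running locally and exposes Superset's standard /health endpoint:

```python
# Smoke test for the newly enabled superset service, assuming it is
# mapped to localhost:8088 as in the compose file above.
import requests

try:
    resp = requests.get("http://localhost:8088/health", timeout=5)
    print("Superset reachable:", resp.status_code == 200)
except requests.ConnectionError:
    print("Superset is not up yet; check `docker compose ps`.")
```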
level_3/poetry.lock (generated file, 2 lines changed)
@@ -6240,4 +6240,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "1700f1cab54ed107d299a47031ad53c58e5e72e3791b9113652fb4ff7854a91a"
+content-hash = "e2d17132884b261841ab4ae6dbe1ae0d91a940915a1c99fc47a0d1ed3f920c05"
@@ -41,6 +41,7 @@ unstructured = {extras = ["pdf"], version = "^0.10.23"}
 sentence-transformers = "2.2.2"
 torch = "2.0.*"
 segment-analytics-python = "^2.2.3"
+pdf2image = "^1.16.3"
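The new pdf2image dependency converts PDF pages into PIL images; it needs the poppler system package at runtime. A minimal usage sketch against one of the PDFs added in this commit:

```python
# Minimal pdf2image sketch; requires the poppler system package.
# Output filenames are illustrative.
from pdf2image import convert_from_path

pages = convert_from_path("level_3/.data/CallofWild.pdf", dpi=200)
for i, page in enumerate(pages):
    page.save(f"page_{i}.png", "PNG")  # each page is a PIL.Image
```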
@@ -204,7 +204,7 @@ def generate_param_variants(
 
     # Default values
     defaults = {
-        "chunk_size": 250,
+        "chunk_size": 750,
         "chunk_overlap": 20,
         "similarity_score": 0.5,
         "metadata_variation": 0,
@@ -216,7 +216,7 @@ def generate_param_variants(
     params = {**defaults, **(base_params or {})}
 
     default_increments = {
-        "chunk_size": 150,
+        "chunk_size": 250,
         "chunk_overlap": 10,
         "similarity_score": 0.1,
         "metadata_variation": 1,
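The full body of generate_param_variants is not in this diff, but the defaults and increments suggest a stepped sweep around each base value. A hedged sketch of that idea; everything beyond the two dicts shown above is an assumption, not the repo's actual implementation:

```python
# Hedged sketch of a stepped parameter sweep built from the defaults and
# default_increments shown above; num_variants and the output layout are
# assumptions.
def sketch_param_variants(base_params=None, num_variants=3):
    defaults = {"chunk_size": 750, "chunk_overlap": 20,
                "similarity_score": 0.5, "metadata_variation": 0}
    increments = {"chunk_size": 250, "chunk_overlap": 10,
                  "similarity_score": 0.1, "metadata_variation": 1}
    params = {**defaults, **(base_params or {})}
    # For each parameter, step upward from its base value by its increment.
    return {
        key: [params[key] + i * increments[key] for i in range(num_variants)]
        for key in params
    }

print(sketch_param_variants()["chunk_size"])  # [750, 1000, 1250]
```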
@@ -615,7 +615,10 @@ async def start_test(
         if loader_settings.get('search_type') == 'bm25':
             return retrieve_action["data"]["Get"][test_id]
         else:
-            return retrieve_action["data"]["Get"][test_id][0]["text"]
+            try:
+                return retrieve_action["data"]["Get"][test_id][0]["text"]
+            except:
+                return retrieve_action["data"]["Get"][test_id]
 
     async def run_eval(test_item, search_result):
         logging.info("Initiated test set evaluation")
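The bare except added here swallows any failure, not just the shape mismatch the fallback is meant to handle. A hedged sketch of a narrower version, under the assumption that only a missing or non-dict first element can trip the happy path:

```python
# Hedged sketch: same fallback as the hunk above, but catching only the
# errors a malformed result shape can raise instead of a bare except.
def extract_text(retrieve_action, test_id):
    results = retrieve_action["data"]["Get"][test_id]
    try:
        return results[0]["text"]
    except (IndexError, KeyError, TypeError):
        # Result list is empty or items lack a "text" field; return raw results.
        return results
```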
@@ -686,7 +689,7 @@ async def start_test(
             metadata=metadata,
             retriever_type=retriever_type,
         )  # No params for this case
-        results.append([result, "No params"])
+        results.append(result)
 
     elif retriever_type == "single_document_context":
         logging.info("Retriever type: single document context")
@@ -697,43 +700,80 @@ async def start_test(
             )  # Add the params to the result
             # result.append(param)
             results.append(result)
 
     for b in results:
         logging.info("Loading %s", str(b))
-        for result, chunk in b:
-            logging.info("Loading %s", str(result))
-            await add_entity(
-                session,
-                TestOutput(
-                    id=test_id,
-                    test_set_id=test_set_id,
-                    operation_id=job_id,
-                    set_id=str(uuid.uuid4()),
-                    user_id=user_id,
-                    test_results=result["success"],
-                    test_score=str(result["score"]),
-                    test_metric_name=result["metric_name"],
-                    test_query=result["query"],
-                    test_output=result["output"],
-                    test_expected_output=str(["expected_output"]),
-                    test_context=result["context"][0],
-                    test_params=str(chunk),  # Add params to the database table
-                ),
-            )
-            analytics.track(user_id, 'TestOutput', {
-                'test_set_id': test_set_id,
-                'operation_id': job_id,
-                'set_id': str(uuid.uuid4()),
-                'test_results': result["success"],
-                'test_score': str(result["score"]),
-                'test_metric_name': result["metric_name"],
-                'test_query': result["query"],
-                'test_output': result["output"],
-                'test_expected_output': str(["expected_output"]),
-                'test_context': result["context"][0],
-                'test_params': str(chunk),
-            })
-            analytics.flush()
+        if retriever_type == "single_document_context":
+            for result, chunk in b:
+                logging.info("Loading %s", str(result))
+                await add_entity(
+                    session,
+                    TestOutput(
+                        id=test_id,
+                        test_set_id=test_set_id,
+                        operation_id=job_id,
+                        set_id=str(uuid.uuid4()),
+                        user_id=user_id,
+                        test_results=result["success"],
+                        test_score=str(result["score"]),
+                        test_metric_name=result["metric_name"],
+                        test_query=result["query"],
+                        test_output=result["output"],
+                        test_expected_output=str(["expected_output"]),
+                        test_context=result["context"][0],
+                        test_params=str(chunk),  # Add params to the database table
+                    ),
+                )
+                analytics.track(user_id, 'TestOutput', {
+                    'test_set_id': test_set_id,
+                    'operation_id': job_id,
+                    'set_id': str(uuid.uuid4()),
+                    'test_results': result["success"],
+                    'test_score': str(result["score"]),
+                    'test_metric_name': result["metric_name"],
+                    'test_query': result["query"],
+                    'test_output': result["output"],
+                    'test_expected_output': str(["expected_output"]),
+                    'test_context': result["context"][0],
+                    'test_params': str(chunk),
+                })
+                analytics.flush()
+        else:
+            chunk = "None"
+            for result in b:
+                logging.info("Loading %s", str(result))
+                await add_entity(
+                    session,
+                    TestOutput(
+                        id=test_id,
+                        test_set_id=test_set_id,
+                        operation_id=job_id,
+                        set_id=str(uuid.uuid4()),
+                        user_id=user_id,
+                        test_results=result[0]["success"],
+                        test_score=str(result[0]["score"]),
+                        test_metric_name=result[0]["metric_name"],
+                        test_query=result[0]["query"],
+                        test_output=result[0]["output"],
+                        test_expected_output=str(["expected_output"]),
+                        test_context=result[0]["context"][0],
+                        test_params=str(chunk),  # Add params to the database table
+                    ),
+                )
+                analytics.track(user_id, 'TestOutput', {
+                    'test_set_id': test_set_id,
+                    'operation_id': job_id,
+                    'set_id': str(uuid.uuid4()),
+                    'test_results': result[0]["success"],
+                    'test_score': str(result[0]["score"]),
+                    'test_metric_name': result[0]["metric_name"],
+                    'test_query': result[0]["query"],
+                    'test_output': result[0]["output"],
+                    'test_expected_output': str(["expected_output"]),
+                    'test_context': result[0]["context"][0],
+                    'test_params': str(chunk),
+                })
+                analytics.flush()
 
 
     await update_entity(session, Operation, job_id, "COMPLETED")
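The tracking calls added here use segment-analytics-python, which the project already declares as a dependency. Under that library's module-level API a write key must be configured once before track and flush will deliver anything; a minimal sketch with a placeholder key and example properties:

```python
# Minimal Segment tracking sketch mirroring the calls added above.
# "YOUR_WRITE_KEY" and the property values are placeholders; the repo
# presumably configures its own key elsewhere.
import segment.analytics as analytics

analytics.write_key = "YOUR_WRITE_KEY"

analytics.track("97980cfea0067", "TestOutput", {
    "test_set_id": "example-set",
    "test_score": "0.92",
})
analytics.flush()  # block until queued events are sent
```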
||||||
|
|
@ -29,7 +29,10 @@ def vanilla_chunker(source_data, chunk_size=100, chunk_overlap=20):
|
||||||
chunk_overlap=chunk_overlap,
|
chunk_overlap=chunk_overlap,
|
||||||
length_function=len
|
length_function=len
|
||||||
)
|
)
|
||||||
pages = text_splitter.create_documents([source_data])
|
try:
|
||||||
|
pages = text_splitter.create_documents([source_data])
|
||||||
|
except:
|
||||||
|
pages = text_splitter.create_documents(source_data.content)
|
||||||
# pages = source_data.load_and_split()
|
# pages = source_data.load_and_split()
|
||||||
return pages
|
return pages
|
||||||
def chunk_data_exact(data_chunks, chunk_size, chunk_overlap):
|
def chunk_data_exact(data_chunks, chunk_size, chunk_overlap):
|
||||||
|
|
|
||||||
|
|
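For context, create_documents expects a list of strings, which is why the fallback re-calls it with source_data.content when source_data is not a plain string. A standalone sketch of the same splitter setup, assuming the text_splitter above is LangChain's RecursiveCharacterTextSplitter:

```python
# Standalone sketch of the chunking path, assuming the text_splitter in
# vanilla_chunker is LangChain's RecursiveCharacterTextSplitter.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,  # measure chunks in characters
)
# create_documents takes a list of strings and returns Document objects.
pages = splitter.create_documents(["some long source text ..."])
print(len(pages), pages[0].page_content[:50])
```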
@@ -56,16 +56,20 @@ async def _document_loader( observation: str, loader_settings: dict):
     if document_format == "PDF":
         # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
         # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
 
 
     elif document_format == "TEXT":
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
         # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
 
     else:
         raise ValueError(f"Error: ")