Update docs, fix issue with params, add tracking
parent 3409d5bf61
commit e7e5d9831e

11 changed files with 130 additions and 62 deletions
New binary files (content not shown):

  level_3/.data/BartlebyTheScrivener.pdf
  level_3/.data/CallofWild.pdf
@@ -13,7 +13,7 @@ Send the request to the API:
 ```
 curl -X POST -H "Content-Type: application/json" -d '{
   "payload": {
-    "user_id": "681",
+    "user_id": "97980cfea0067",
     "data": [".data/3ZCCCW.pdf"],
     "test_set": "sample",
     "params": ["chunk_size"],
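For readers following the docs change, the same request can be issued from Python. This is only a sketch: the endpoint URL is not visible in this hunk, so the address below is a placeholder.

```python
# Sketch of the curl call above using requests. The endpoint URL is not shown
# in this hunk; "http://localhost:8000/rag-test" is a placeholder, not the
# real route.
import requests

body = {
    "payload": {
        "user_id": "97980cfea0067",  # updated value from this commit
        "data": [".data/3ZCCCW.pdf"],
        "test_set": "sample",
        "params": ["chunk_size"],
    }
}
resp = requests.post("http://localhost:8000/rag-test", json=body)  # placeholder URL
resp.raise_for_status()
print(resp.json())
```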
@@ -81,7 +81,7 @@ After that, you can run the RAG test manager from your command line.
 python rag_test_manager.py \
     --file ".data" \
     --test_set "example_data/test_set.json" \
-    --user_id "666" \
+    --user_id "97980cfea0067" \
     --params "chunk_size" "search_type" \
     --metadata "example_data/metadata.json" \
     --retriever_type "single_document_context"
@@ -89,3 +89,21 @@ After that, you can run the RAG test manager from your command line.
 ```
 
 Examples of metadata structure and test set are in the folder "example_data"
+
+
+python rag_test_manager.py \
+    --file ".data" \
+    --test_set "example_data/test_set.json" \
+    --user_id "97980cfea0067" \
+    --params "chunk_size" "search_type" \
+    --metadata "example_data/metadata.json" \
+    --retriever_type "llm_context"
+
+
+python rag_test_manager.py \
+    --file ".data" \
+    --test_set "example_data/test_set.json" \
+    --user_id "97980cfea0068" \
+    --params "chunk_size" "search_type" "overlap" \
+    --metadata "example_data/metadata.json" \
+    --retriever_type "single_document_context"
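These invocations (here and in the hunk above) pass several space-separated values to --params, with no commas between them. A minimal sketch of a parser that accepts such flags; how rag_test_manager.py actually declares its arguments is not shown in this diff, so nargs="+" is an assumption that fits the usage:

```python
# Minimal sketch of a CLI matching the invocations above. The real parser in
# rag_test_manager.py is not part of this diff; argument names mirror the docs
# and nargs="+" is assumed from the space-separated values.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--file")
parser.add_argument("--test_set")
parser.add_argument("--user_id")
parser.add_argument("--params", nargs="+")  # e.g. --params "chunk_size" "search_type" "overlap"
parser.add_argument("--metadata")
parser.add_argument("--retriever_type")

args = parser.parse_args()
print(args.params)  # ['chunk_size', 'search_type', 'overlap']
```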
@@ -51,23 +51,25 @@ services:
     ports:
       - "5432:5432"
 
-#  superset:
-#    platform: linux/amd64
-#    build:
-#      context: ./superset
-#      dockerfile: Dockerfile
-#    container_name: superset
-#    environment:
-#      - ADMIN_USERNAME=admin
-#      - ADMIN_EMAIL=vasilije@topoteretes.com
-#      - ADMIN_PASSWORD=admin
-#      - POSTGRES_USER=bla
-#      - POSTGRES_PASSWORD=bla
-#      - POSTGRES_DB=bubu
-#    networks:
-#      - promethai_mem_backend
-#    ports:
-#      - '8088:8088'
+  superset:
+    platform: linux/amd64
+    build:
+      context: ./superset
+      dockerfile: Dockerfile
+    container_name: superset
+    environment:
+      - ADMIN_USERNAME=admin
+      - ADMIN_EMAIL=vasilije@topoteretes.com
+      - ADMIN_PASSWORD=admin
+      - POSTGRES_USER=bla
+      - POSTGRES_PASSWORD=bla
+      - POSTGRES_DB=bubu
+    networks:
+      - promethai_mem_backend
+    ports:
+      - '8088:8088'
+    depends_on:
+      - postgres
 
 networks:
   promethai_mem_backend:
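With the superset service uncommented and given a depends_on entry, `docker compose up -d superset` now brings up postgres first. Note that depends_on only orders container startup; it does not wait for postgres to accept connections, so Superset should still tolerate a briefly unavailable database on first boot.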
level_3/poetry.lock (generated; 2 lines changed)

@@ -6240,4 +6240,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "1700f1cab54ed107d299a47031ad53c58e5e72e3791b9113652fb4ff7854a91a"
+content-hash = "e2d17132884b261841ab4ae6dbe1ae0d91a940915a1c99fc47a0d1ed3f920c05"
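The content-hash is Poetry's fingerprint of the dependency declarations in pyproject.toml; it is regenerated (for example by running `poetry lock`) whenever dependencies change, and Poetry refuses to install from a lock file whose hash no longer matches. The change here corresponds to the segment-analytics-python addition in the next hunk.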
@@ -41,6 +41,7 @@ unstructured = {extras = ["pdf"], version = "^0.10.23"}
 sentence-transformers = "2.2.2"
 torch = "2.0.*"
+segment-analytics-python = "^2.2.3"
 pdf2image = "^1.16.3"
 
 
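segment-analytics-python is the Segment client behind the analytics.track/analytics.flush calls added later in this commit. A minimal sketch of how the client is typically configured; where the write key comes from is an assumption, since the setup code is not in this diff:

```python
# Sketch of Segment client setup for the analytics.track()/flush() calls in
# this commit. Reading the write key from an environment variable is an
# assumption; the diff does not show how it is configured.
import os
import segment.analytics as analytics

analytics.write_key = os.environ["SEGMENT_WRITE_KEY"]  # assumed env var

analytics.track("97980cfea0067", "TestOutput", {"test_set_id": "sample"})
analytics.flush()  # drain the queue before the process exits
```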
@@ -204,7 +204,7 @@ def generate_param_variants(
 
     # Default values
     defaults = {
-        "chunk_size": 250,
+        "chunk_size": 750,
         "chunk_overlap": 20,
         "similarity_score": 0.5,
         "metadata_variation": 0,
@@ -216,7 +216,7 @@ def generate_param_variants(
     params = {**defaults, **(base_params or {})}
 
     default_increments = {
-        "chunk_size": 150,
+        "chunk_size": 250,
        "chunk_overlap": 10,
         "similarity_score": 0.1,
         "metadata_variation": 1,
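Both hunks retune the parameter sweep's chunk-size values. Because params is built as {**defaults, **(base_params or {})}, caller-supplied values still win; only unspecified keys pick up the new defaults. A quick illustration of that merge semantics:

```python
# Later-unpacked dicts override earlier ones, so base_params beats defaults.
defaults = {"chunk_size": 750, "chunk_overlap": 20, "similarity_score": 0.5}
base_params = {"chunk_size": 500}

params = {**defaults, **(base_params or {})}
print(params)  # {'chunk_size': 500, 'chunk_overlap': 20, 'similarity_score': 0.5}
```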
@@ -615,7 +615,10 @@ async def start_test(
         if loader_settings.get('search_type') == 'bm25':
             return retrieve_action["data"]["Get"][test_id]
-        else:
-            return retrieve_action["data"]["Get"][test_id][0]["text"]
+        try:
+            return retrieve_action["data"]["Get"][test_id][0]["text"]
+        except:
+            return retrieve_action["data"]["Get"][test_id]
+
 
     async def run_eval(test_item, search_result):
         logging.info("Initiated test set evaluation")
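The if/else on result shape becomes an EAFP fallback: attempt the nested [0]["text"] access, and fall back to the raw list for responses (such as bm25's) shaped differently. A bare except also swallows unrelated failures, though; a narrower sketch of the same idea:

```python
# Same fallback, but catching only the shape-related errors so that genuine
# failures still propagate. Names mirror those used in start_test.
def extract_search_result(retrieve_action: dict, test_id: str):
    hits = retrieve_action["data"]["Get"][test_id]
    try:
        return hits[0]["text"]
    except (IndexError, KeyError, TypeError):
        return hits  # e.g. bm25 responses, where the list itself is the result
```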
@@ -686,7 +689,7 @@ async def start_test(
                 metadata=metadata,
                 retriever_type=retriever_type,
             )  # No params for this case
-            results.append([result, "No params"])
+            results.append(result)
 
         elif retriever_type == "single_document_context":
             logging.info("Retriever type: single document context")
@@ -697,43 +700,80 @@ async def start_test(
             )  # Add the params to the result
             # result.append(param)
             results.append(result)
 
     for b in results:
         logging.info("Loading %s", str(b))
-        for result, chunk in b:
-            logging.info("Loading %s", str(result))
-            await add_entity(
-                session,
-                TestOutput(
-                    id=test_id,
-                    test_set_id=test_set_id,
-                    operation_id=job_id,
-                    set_id=str(uuid.uuid4()),
-                    user_id=user_id,
-                    test_results=result["success"],
-                    test_score=str(result["score"]),
-                    test_metric_name=result["metric_name"],
-                    test_query=result["query"],
-                    test_output=result["output"],
-                    test_expected_output=str(["expected_output"]),
-                    test_context=result["context"][0],
-                    test_params=str(chunk),  # Add params to the database table
-                ),
-            )
-            analytics.track(user_id, 'TestOutput', {
-                'test_set_id': test_set_id,
-                'operation_id': job_id,
-                'set_id': str(uuid.uuid4()),
-                'test_results': result["success"],
-                'test_score': str(result["score"]),
-                'test_metric_name': result["metric_name"],
-                'test_query': result["query"],
-                'test_output': result["output"],
-                'test_expected_output': str(["expected_output"]),
-                'test_context': result["context"][0],
-                'test_params': str(chunk),
-            })
-            analytics.flush()
+        if retriever_type == "single_document_context":
+            for result, chunk in b:
+                logging.info("Loading %s", str(result))
+                await add_entity(
+                    session,
+                    TestOutput(
+                        id=test_id,
+                        test_set_id=test_set_id,
+                        operation_id=job_id,
+                        set_id=str(uuid.uuid4()),
+                        user_id=user_id,
+                        test_results=result["success"],
+                        test_score=str(result["score"]),
+                        test_metric_name=result["metric_name"],
+                        test_query=result["query"],
+                        test_output=result["output"],
+                        test_expected_output=str(["expected_output"]),
+                        test_context=result["context"][0],
+                        test_params=str(chunk),  # Add params to the database table
+                    ),
+                )
+                analytics.track(user_id, 'TestOutput', {
+                    'test_set_id': test_set_id,
+                    'operation_id': job_id,
+                    'set_id': str(uuid.uuid4()),
+                    'test_results': result["success"],
+                    'test_score': str(result["score"]),
+                    'test_metric_name': result["metric_name"],
+                    'test_query': result["query"],
+                    'test_output': result["output"],
+                    'test_expected_output': str(["expected_output"]),
+                    'test_context': result["context"][0],
+                    'test_params': str(chunk),
+                })
+                analytics.flush()
+        else:
+            chunk = "None"
+            for result in b:
+                logging.info("Loading %s", str(result))
+                await add_entity(
+                    session,
+                    TestOutput(
+                        id=test_id,
+                        test_set_id=test_set_id,
+                        operation_id=job_id,
+                        set_id=str(uuid.uuid4()),
+                        user_id=user_id,
+                        test_results=result[0]["success"],
+                        test_score=str(result[0]["score"]),
+                        test_metric_name=result[0]["metric_name"],
+                        test_query=result[0]["query"],
+                        test_output=result[0]["output"],
+                        test_expected_output=str(["expected_output"]),
+                        test_context=result[0]["context"][0],
+                        test_params=str(chunk),  # Add params to the database table
+                    ),
+                )
+                analytics.track(user_id, 'TestOutput', {
+                    'test_set_id': test_set_id,
+                    'operation_id': job_id,
+                    'set_id': str(uuid.uuid4()),
+                    'test_results': result[0]["success"],
+                    'test_score': str(result[0]["score"]),
+                    'test_metric_name': result[0]["metric_name"],
+                    'test_query': result[0]["query"],
+                    'test_output': result[0]["output"],
+                    'test_expected_output': str(["expected_output"]),
+                    'test_context': result[0]["context"][0],
+                    'test_params': str(chunk),
+                })
+                analytics.flush()
 
 
     await update_entity(session, Operation, job_id, "COMPLETED")
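The two new branches differ only in how a result is unpacked (result vs result[0]) and in the chunk label; the TestOutput row and the analytics payload are otherwise identical. Also, str(["expected_output"]) stringifies a literal list and looks like it was meant to read from result. A sketch of how the duplication could be factored out, with that expression corrected under that assumption (add_entity, TestOutput, analytics, and uuid come from the surrounding module):

```python
# Sketch: one shared writer for both branches of the results loop. Only the
# helper itself is new; every other name mirrors the diff. Assumes
# str(["expected_output"]) was meant to be the result's expected output.
async def record_test_output(session, *, test_id, test_set_id, job_id,
                             user_id, result, chunk):
    properties = {
        "test_set_id": test_set_id,
        "operation_id": job_id,
        "set_id": str(uuid.uuid4()),
        "test_results": result["success"],
        "test_score": str(result["score"]),
        "test_metric_name": result["metric_name"],
        "test_query": result["query"],
        "test_output": result["output"],
        "test_expected_output": str(result.get("expected_output")),
        "test_context": result["context"][0],
        "test_params": str(chunk),
    }
    await add_entity(session, TestOutput(id=test_id, user_id=user_id, **properties))
    analytics.track(user_id, "TestOutput", properties)
    analytics.flush()
```

Each branch then reduces to a single call, e.g. result=result[0], chunk="None" in the no-params case.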
@@ -29,7 +29,10 @@ def vanilla_chunker(source_data, chunk_size=100, chunk_overlap=20):
         chunk_overlap=chunk_overlap,
         length_function=len
     )
-    pages = text_splitter.create_documents([source_data])
+    try:
+        pages = text_splitter.create_documents([source_data])
+    except:
+        pages = text_splitter.create_documents(source_data.content)
     # pages = source_data.load_and_split()
     return pages
 def chunk_data_exact(data_chunks, chunk_size, chunk_overlap):
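The try/except lets vanilla_chunker accept either a raw string or an object exposing .content; the bare except, however, also hides real splitter errors. A sketch of the same dispatch made explicit:

```python
# Same string-or-document dispatch with an explicit check instead of a bare
# except. Assumes non-string inputs expose .content, as the fallback branch
# in the diff implies.
def split_source(text_splitter, source_data):
    if isinstance(source_data, str):
        return text_splitter.create_documents([source_data])
    return text_splitter.create_documents(source_data.content)
```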
@@ -56,16 +56,20 @@ async def _document_loader( observation: str, loader_settings: dict):
     if document_format == "PDF":
         # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
         # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
 
 
     elif document_format == "TEXT":
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
         # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
 
     else:
         raise ValueError(f"Error: ")
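Both branches now load, chunk via chunk_data, and append the chunked pages (previously the raw documents) to chunked_doc. Since the PDF and TEXT branches are line-for-line identical after this change, they could collapse into one membership test; a sketch, which also gives the ValueError a message (the original f-string is empty):

```python
# Sketch: fold the identical PDF/TEXT branches into one. Names mirror the
# diff; the error message is an addition, since the original is blank.
if document_format in ("PDF", "TEXT"):
    documents = loader.load()
    pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents),
                       chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    logging.info("Documents: %s", documents)
    chunked_doc.append(pages)
else:
    raise ValueError(f"Unsupported document format: {document_format}")
```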