diff --git a/level_3/.data/3ZCCCW.pdf b/level_3/.data/3ZCCCW.pdf
deleted file mode 100644
index e610a35c0..000000000
Binary files a/level_3/.data/3ZCCCW.pdf and /dev/null differ
diff --git a/level_3/.data/BartlebyTheScrivener.pdf b/level_3/.data/BartlebyTheScrivener.pdf
new file mode 100644
index 000000000..1a38dd3de
Binary files /dev/null and b/level_3/.data/BartlebyTheScrivener.pdf differ
diff --git a/level_3/.data/CallofWild.pdf b/level_3/.data/CallofWild.pdf
new file mode 100644
index 000000000..8b3a7fcb0
Binary files /dev/null and b/level_3/.data/CallofWild.pdf differ
diff --git a/level_3/.data/dssd.pdf b/level_3/.data/dssd.pdf
deleted file mode 100644
index e610a35c0..000000000
Binary files a/level_3/.data/dssd.pdf and /dev/null differ
diff --git a/level_3/Readme.md b/level_3/Readme.md
index 3987728fe..5a54d1e28 100644
--- a/level_3/Readme.md
+++ b/level_3/Readme.md
@@ -13,7 +13,7 @@ Send the request to the API:
 ```
 curl -X POST -H "Content-Type: application/json" -d '{
   "payload": {
-    "user_id": "681",
+    "user_id": "97980cfea0067",
     "data": [".data/3ZCCCW.pdf"],
     "test_set": "sample",
     "params": ["chunk_size"],
@@ -81,7 +81,7 @@ After that, you can run the RAG test manager from your command line.
 python rag_test_manager.py \
   --file ".data" \
   --test_set "example_data/test_set.json" \
-  --user_id "666" \
+  --user_id "97980cfea0067" \
   --params "chunk_size" "search_type" \
   --metadata "example_data/metadata.json" \
   --retriever_type "single_document_context"
@@ -89,3 +89,21 @@ After that, you can run the RAG test manager from your command line.
 ```
 
 Examples of metadata structure and test set are in the folder "example_data"
+
+
+    python rag_test_manager.py \
+    --file ".data" \
+    --test_set "example_data/test_set.json" \
+    --user_id "97980cfea0067" \
+    --params "chunk_size" "search_type" \
+    --metadata "example_data/metadata.json" \
+    --retriever_type "llm_context"
+
+
+    python rag_test_manager.py \
+    --file ".data" \
+    --test_set "example_data/test_set.json" \
+    --user_id "97980cfea0068" \
+    --params "chunk_size" "search_type" "overlap" \
+    --metadata "example_data/metadata.json" \
+    --retriever_type "single_document_context"
diff --git a/level_3/docker-compose.yml b/level_3/docker-compose.yml
index d5bc2269d..982fe9ad1 100644
--- a/level_3/docker-compose.yml
+++ b/level_3/docker-compose.yml
@@ -51,23 +51,25 @@ services:
     ports:
       - "5432:5432"
 
-#  superset:
-#    platform: linux/amd64
-#    build:
-#      context: ./superset
-#      dockerfile: Dockerfile
-#    container_name: superset
-#    environment:
-#      - ADMIN_USERNAME=admin
-#      - ADMIN_EMAIL=vasilije@topoteretes.com
-#      - ADMIN_PASSWORD=admin
-#      - POSTGRES_USER=bla
-#      - POSTGRES_PASSWORD=bla
-#      - POSTGRES_DB=bubu
-#    networks:
-#      - promethai_mem_backend
-#    ports:
-#      - '8088:8088'
+  superset:
+    platform: linux/amd64
+    build:
+      context: ./superset
+      dockerfile: Dockerfile
+    container_name: superset
+    environment:
+      - ADMIN_USERNAME=admin
+      - ADMIN_EMAIL=vasilije@topoteretes.com
+      - ADMIN_PASSWORD=admin
+      - POSTGRES_USER=bla
+      - POSTGRES_PASSWORD=bla
+      - POSTGRES_DB=bubu
+    networks:
+      - promethai_mem_backend
+    ports:
+      - '8088:8088'
+    depends_on:
+      - postgres
 
 networks:
   promethai_mem_backend:
diff --git a/level_3/poetry.lock b/level_3/poetry.lock
index 15e69a46e..d1a3530d3 100644
--- a/level_3/poetry.lock
+++ b/level_3/poetry.lock
@@ -6240,4 +6240,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "1700f1cab54ed107d299a47031ad53c58e5e72e3791b9113652fb4ff7854a91a"
+content-hash = "e2d17132884b261841ab4ae6dbe1ae0d91a940915a1c99fc47a0d1ed3f920c05"
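The Readme hunks above swap the numeric user IDs for an opaque ID and add two more CLI invocations. For reference, the same payload the Readme's curl command sends can be posted from Python; this is a minimal sketch, and since the diff truncates the curl target, the host, port, and endpoint path here are purely assumptions:

```python
# Hypothetical client mirroring the Readme's curl example.
# The endpoint URL and port are assumptions -- the diff truncates the curl target.
import requests

payload = {
    "payload": {
        "user_id": "97980cfea0067",    # matches the updated Readme examples
        "data": [".data/3ZCCCW.pdf"],
        "test_set": "sample",
        "params": ["chunk_size"],
    }
}

response = requests.post(
    "http://localhost:8000/rag-test",  # assumed endpoint, not shown in the diff
    json=payload,
    timeout=60,
)
response.raise_for_status()
print(response.json())
```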
"1700f1cab54ed107d299a47031ad53c58e5e72e3791b9113652fb4ff7854a91a" +content-hash = "e2d17132884b261841ab4ae6dbe1ae0d91a940915a1c99fc47a0d1ed3f920c05" diff --git a/level_3/pyproject.toml b/level_3/pyproject.toml index 60eb074be..cddce6ca2 100644 --- a/level_3/pyproject.toml +++ b/level_3/pyproject.toml @@ -41,6 +41,7 @@ unstructured = {extras = ["pdf"], version = "^0.10.23"} sentence-transformers = "2.2.2" torch = "2.0.*" segment-analytics-python = "^2.2.3" +pdf2image = "^1.16.3" diff --git a/level_3/rag_test_manager.py b/level_3/rag_test_manager.py index 108b98d08..2784e1143 100644 --- a/level_3/rag_test_manager.py +++ b/level_3/rag_test_manager.py @@ -204,7 +204,7 @@ def generate_param_variants( # Default values defaults = { - "chunk_size": 250, + "chunk_size": 750, "chunk_overlap": 20, "similarity_score": 0.5, "metadata_variation": 0, @@ -216,7 +216,7 @@ def generate_param_variants( params = {**defaults, **(base_params or {})} default_increments = { - "chunk_size": 150, + "chunk_size": 250, "chunk_overlap": 10, "similarity_score": 0.1, "metadata_variation": 1, @@ -615,7 +615,10 @@ async def start_test( if loader_settings.get('search_type') == 'bm25': return retrieve_action["data"]["Get"][test_id] else: - return retrieve_action["data"]["Get"][test_id][0]["text"] + try: + return retrieve_action["data"]["Get"][test_id][0]["text"] + except: + return retrieve_action["data"]["Get"][test_id] async def run_eval(test_item, search_result): logging.info("Initiated test set evaluation") @@ -686,7 +689,7 @@ async def start_test( metadata=metadata, retriever_type=retriever_type, ) # No params for this case - results.append([result, "No params"]) + results.append(result) elif retriever_type == "single_document_context": logging.info("Retriever type: single document context") @@ -697,43 +700,80 @@ async def start_test( ) # Add the params to the result # result.append(param) results.append(result) - for b in results: logging.info("Loading %s", str(b)) - for result, chunk in b: - logging.info("Loading %s", str(result)) - await add_entity( - session, - TestOutput( - id=test_id, - test_set_id=test_set_id, - operation_id=job_id, - set_id=str(uuid.uuid4()), - user_id=user_id, - test_results=result["success"], - test_score=str(result["score"]), - test_metric_name=result["metric_name"], - test_query=result["query"], - test_output=result["output"], - test_expected_output=str(["expected_output"]), - test_context=result["context"][0], - test_params=str(chunk), # Add params to the database table - ), - ) - analytics.track(user_id, 'TestOutput', { - 'test_set_id': test_set_id, - 'operation_id': job_id, - 'set_id' : str(uuid.uuid4()), - 'test_results' : result["success"], - 'test_score' : str(result["score"]), - 'test_metric_name' : result["metric_name"], - 'test_query' : result["query"], - 'test_output' : result["output"], - 'test_expected_output' : str(["expected_output"]), - 'test_context' : result["context"][0], - 'test_params' : str(chunk), - }) - analytics.flush() + if retriever_type == "single_document_context": + for result, chunk in b: + logging.info("Loading %s", str(result)) + await add_entity( + session, + TestOutput( + id=test_id, + test_set_id=test_set_id, + operation_id=job_id, + set_id=str(uuid.uuid4()), + user_id=user_id, + test_results=result["success"], + test_score=str(result["score"]), + test_metric_name=result["metric_name"], + test_query=result["query"], + test_output=result["output"], + test_expected_output=str(["expected_output"]), + test_context=result["context"][0], + 
diff --git a/level_3/vectordb/chunkers/chunkers.py b/level_3/vectordb/chunkers/chunkers.py
index 3019b2478..225b72e87 100644
--- a/level_3/vectordb/chunkers/chunkers.py
+++ b/level_3/vectordb/chunkers/chunkers.py
@@ -29,7 +29,10 @@ def vanilla_chunker(source_data, chunk_size=100, chunk_overlap=20):
         chunk_overlap=chunk_overlap,
         length_function=len
     )
-    pages = text_splitter.create_documents([source_data])
+    try:
+        pages = text_splitter.create_documents([source_data])
+    except Exception:
+        pages = text_splitter.create_documents(source_data.content)
     # pages = source_data.load_and_split()
     return pages
 def chunk_data_exact(data_chunks, chunk_size, chunk_overlap):
diff --git a/level_3/vectordb/loaders/loaders.py b/level_3/vectordb/loaders/loaders.py
index acb54147e..ce8a13c73 100644
--- a/level_3/vectordb/loaders/loaders.py
+++ b/level_3/vectordb/loaders/loaders.py
@@ -56,16 +56,20 @@ async def _document_loader(
         observation: str, loader_settings: dict):
     if document_format == "PDF":
         # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
         # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
 
     elif document_format == "TEXT":
         documents = loader.load()
+        pages = chunk_data(chunk_strategy=loader_strategy, source_data=str(documents), chunk_size=chunk_size,
+                           chunk_overlap=chunk_overlap)
         logging.info("Documents: %s", documents)
        # pages = documents.load_and_split()
-        chunked_doc.append(documents)
+        chunked_doc.append(pages)
 
     else:
         raise ValueError(f"Error: ")
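With these hunks, the loaders now chunk documents at load time, and vanilla_chunker falls back to source_data.content when create_documents rejects the raw input. The splitter's constructor sits above the hunk shown, so treating it as LangChain's RecursiveCharacterTextSplitter is an assumption; a standalone sketch of the call the chunker makes:

```python
# Standalone sketch of the chunking path. RecursiveCharacterTextSplitter is an
# assumption -- the splitter's constructor is above the hunk shown in the diff.
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(source_data, chunk_size=750, chunk_overlap=20):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    try:
        # Normal path: a plain string, as when the loaders pass str(documents).
        return splitter.create_documents([source_data])
    except Exception:
        # Fallback mirroring the patched vanilla_chunker: objects exposing .content.
        return splitter.create_documents(source_data.content)

pages = chunk_text("A long document body. " * 200)
print(len(pages), pages[0].page_content[:40])
```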