Added load from .data folder

Vasilije 2023-10-16 15:25:17 +01:00
parent 75aded58b7
commit cccc87b05c
5 changed files with 2868 additions and 1389 deletions

BIN  level_3/.data/3ZCCCW.pdf (new file)

Binary file not shown.

4068  level_3/poetry.lock (generated)

File diff suppressed because it is too large.

level_3/pyproject.toml  View file

@@ -48,6 +48,7 @@ llama-hub = "^0.0.34"
 sqlalchemy = "^2.0.21"
 asyncpg = "^0.28.0"
 dash = "^2.14.0"
+unstructured = {extras = ["pdf"], version = "^0.10.23"}
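The same dependency bump could presumably be reproduced with Poetry's CLI (version pin taken from the diff; the pdf extra pulls in the parsers the new .data loader relies on):

poetry add "unstructured[pdf]@^0.10.23"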

View file

@@ -243,9 +243,9 @@ def data_location_route(data_string: str):
     class LocationRoute(Enum):
         """Represents classifier for the data location"""
-        DEVICE = "DEVICE"
-        URL = "URL"
-        DATABASE = "DATABASE"
+        DEVICE = "file_path_starting_with_.data_or_containing_it"
+        # URL = "url starting with http or https"
+        DATABASE = "database_name_like_postgres_or_mysql"
     return LocationRoute(data_string).name
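Note that LocationRoute(data_string) looks members up by value, so after this change it only resolves when the classifier returns the full descriptive string. A minimal standalone sketch of the two lookup directions, assuming only the enum shown above:

from enum import Enum

class LocationRoute(Enum):
    DEVICE = "file_path_starting_with_.data_or_containing_it"
    DATABASE = "database_name_like_postgres_or_mysql"

# lookup by value -- what data_location_route does; raises ValueError for a bare "DEVICE"
assert LocationRoute("file_path_starting_with_.data_or_containing_it").name == "DEVICE"
# lookup by member name -- what a caller passing the short label would need instead
assert LocationRoute["DEVICE"].value.startswith("file_path")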
@@ -284,7 +284,9 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
     if params is None:
         data_format = data_format_route(data)  # Assume data_format_route is predefined
-        data_location = data_location_route(data)  # Assume data_location_route is predefined
+        logging.info("Data format is %s", data_format)
+        data_location = data_location_route(data)
+        logging.info("Data location is %s",data_location)# Assume data_location_route is predefined
         test_params = generate_param_variants(
             included_params=['chunk_size'])
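generate_param_variants itself is not part of this diff; as a rough, hypothetical sketch of what included_params=['chunk_size'] implies (grid values invented for illustration, the real helper lives elsewhere in the repo):

# hypothetical stand-in for the repo's generate_param_variants
def generate_param_variants(included_params=None):
    grid = {"chunk_size": [256, 512, 1024], "chunk_overlap": [10, 20, 30]}
    keys = included_params or list(grid)
    return [{key: value} for key in keys for value in grid[key]]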
@@ -392,86 +394,86 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
 async def main():
-    # metadata = {
-    #     "version": "1.0",
-    #     "agreement_id": "AG123456",
-    #     "privacy_policy": "https://example.com/privacy",
-    #     "terms_of_service": "https://example.com/terms",
-    #     "format": "json",
-    #     "schema_version": "1.1",
-    #     "checksum": "a1b2c3d4e5f6",
-    #     "owner": "John Doe",
-    #     "license": "MIT",
-    #     "validity_start": "2023-08-01",
-    #     "validity_end": "2024-07-31",
-    # }
+    metadata = {
+        "version": "1.0",
+        "agreement_id": "AG123456",
+        "privacy_policy": "https://example.com/privacy",
+        "terms_of_service": "https://example.com/terms",
+        "format": "json",
+        "schema_version": "1.1",
+        "checksum": "a1b2c3d4e5f6",
+        "owner": "John Doe",
+        "license": "MIT",
+        "validity_start": "2023-08-01",
+        "validity_end": "2024-07-31",
+    }
 
-    test_set = [
-        {
-            "question": "Who is the main character in 'The Call of the Wild'?",
-            "answer": "Buck"
-        },
-        {
-            "question": "Who wrote 'The Call of the Wild'?",
-            "answer": "Jack London"
-        },
-        {
-            "question": "Where does Buck live at the start of the book?",
-            "answer": "In the Santa Clara Valley, at Judge Millers place."
-        },
-        {
-            "question": "Why is Buck kidnapped?",
-            "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
-        },
-        {
-            "question": "How does Buck become the leader of the sled dog team?",
-            "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
-        }
-    ]
-    # "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
-    #http://public-library.uk/ebooks/59/83.pdf
-    result = await start_test("http://public-library.uk/ebooks/59/83.pdf", test_set=test_set, user_id="676", params=None, metadata=metadata)
+    result = await start_test(".data/3ZCCCW.pdf", test_set=test_set, user_id="676", params=None, metadata=metadata)
+    #
+    # test_set = [
+    #     {
+    #         "question": "Who is the main character in 'The Call of the Wild'?",
+    #         "answer": "Buck"
+    #     },
+    #     {
+    #         "question": "Who wrote 'The Call of the Wild'?",
+    #         "answer": "Jack London"
+    #     },
+    #     {
+    #         "question": "Where does Buck live at the start of the book?",
+    #         "answer": "In the Santa Clara Valley, at Judge Millers place."
+    #     },
+    #     {
+    #         "question": "Why is Buck kidnapped?",
+    #         "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
+    #     },
+    #     {
+    #         "question": "How does Buck become the leader of the sled dog team?",
+    #         "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
+    #     }
+    # ]
+    # # "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
+    # #http://public-library.uk/ebooks/59/83.pdf
+    # result = await start_test("http://public-library.uk/ebooks/59/83.pdf", test_set=test_set, user_id="676", params=None, metadata=metadata)
+    # #
 
-    parser = argparse.ArgumentParser(description="Run tests against a document.")
-    parser.add_argument("--url", required=True, help="URL of the document to test.")
-    parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
-    parser.add_argument("--user_id", required=True, help="User ID.")
-    parser.add_argument("--params", help="Additional parameters in JSON format.")
-    parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
-    parser.add_argument("--generate_test_set", required=True, help="Make a test set.")
-    parser.add_argument("--only_llm_context", required=True, help="Do a test only within the existing LLM context")
-    args = parser.parse_args()
-
-    try:
-        with open(args.test_set, "r") as file:
-            test_set = json.load(file)
-        if not isinstance(test_set, list): # Expecting a list
-            raise TypeError("Parsed test_set JSON is not a list.")
-    except Exception as e:
-        print(f"Error loading test_set: {str(e)}")
-        return
-
-    try:
-        with open(args.metadata, "r") as file:
-            metadata = json.load(file)
-        if not isinstance(metadata, dict):
-            raise TypeError("Parsed metadata JSON is not a dictionary.")
-    except Exception as e:
-        print(f"Error loading metadata: {str(e)}")
-        return
-
-    if args.params:
-        try:
-            params = json.loads(args.params)
-            if not isinstance(params, dict):
-                raise TypeError("Parsed params JSON is not a dictionary.")
-        except json.JSONDecodeError as e:
-            print(f"Error parsing params: {str(e)}")
-            return
-    else:
-        params = None
-    #clean up params here
-    await start_test(args.url, test_set, args.user_id, params=None, metadata=metadata)
+    # parser = argparse.ArgumentParser(description="Run tests against a document.")
+    # parser.add_argument("--url", required=True, help="URL of the document to test.")
+    # parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
+    # parser.add_argument("--user_id", required=True, help="User ID.")
+    # parser.add_argument("--params", help="Additional parameters in JSON format.")
+    # parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
+    # parser.add_argument("--generate_test_set", required=True, help="Make a test set.")
+    # parser.add_argument("--only_llm_context", required=True, help="Do a test only within the existing LLM context")
+    # args = parser.parse_args()
+    #
+    # try:
+    #     with open(args.test_set, "r") as file:
+    #         test_set = json.load(file)
+    #     if not isinstance(test_set, list): # Expecting a list
+    #         raise TypeError("Parsed test_set JSON is not a list.")
+    # except Exception as e:
+    #     print(f"Error loading test_set: {str(e)}")
+    #     return
+    #
+    # try:
+    #     with open(args.metadata, "r") as file:
+    #         metadata = json.load(file)
+    #     if not isinstance(metadata, dict):
+    #         raise TypeError("Parsed metadata JSON is not a dictionary.")
+    # except Exception as e:
+    #     print(f"Error loading metadata: {str(e)}")
+    #     return
+    #
+    # if args.params:
+    #     try:
+    #         params = json.loads(args.params)
+    #         if not isinstance(params, dict):
+    #             raise TypeError("Parsed params JSON is not a dictionary.")
+    #     except json.JSONDecodeError as e:
+    #         print(f"Error parsing params: {str(e)}")
+    #         return
+    # else:
+    #     params = None
+    # #clean up params here
+    # await start_test(args.url, test_set, args.user_id, params=None, metadata=metadata)
 
 if __name__ == "__main__":
     asyncio.run(main())
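For reference, the argparse path that this commit comments out would have been driven roughly like this (script and JSON file names are assumed, not shown in the diff; --generate_test_set and --only_llm_context are required=True in the parser):

python rag_test_manager.py --url http://public-library.uk/ebooks/59/83.pdf \
    --test_set test_set.json --user_id 676 --metadata metadata.json \
    --generate_test_set false --only_llm_context false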

View file

@@ -7,6 +7,8 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from vectordb.chunkers.chunkers import chunk_data
 from llama_hub.file.base import SimpleDirectoryReader
+from langchain.document_loaders import DirectoryLoader
+import requests
 
 async def _document_loader( observation: str, loader_settings: dict):
     # Check the format of the document
@@ -28,12 +30,20 @@ async def _document_loader( observation: str, loader_settings: dict):
         pages = chunk_data(chunk_strategy= loader_strategy, source_data=file_content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         return pages
 
-    elif loader_settings.get("source") == "file":
-        loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True)
-        documents = loader.load_data()
-        pages = documents.load_and_split()
-        return pages
+    elif loader_settings.get("source") == "DEVICE":
+        import os
+        current_directory = os.getcwd()
+        import logging
+        logging.info("Current Directory: %s", current_directory)
+        loader = DirectoryLoader(".data", recursive=True)
+        # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
+        documents = loader.load()
+        logging.info("Documents: %s", documents)
+        # pages = documents.load_and_split()
+        return documents
 
     elif document_format == "text":
         pages = chunk_data(chunk_strategy= loader_strategy, source_data=observation, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
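The new DEVICE branch can be sanity-checked in isolation; a minimal sketch, assuming langchain and unstructured[pdf] are installed and the .data folder added by this commit exists:

import logging
from langchain.document_loaders import DirectoryLoader

logging.basicConfig(level=logging.INFO)

# DirectoryLoader wraps UnstructuredFileLoader per file by default,
# which is presumably why pyproject.toml gains unstructured[pdf] above
loader = DirectoryLoader(".data", recursive=True)
documents = loader.load()
logging.info("Loaded %d document(s)", len(documents))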