Added load from .data folder

parent 75aded58b7
commit cccc87b05c

5 changed files with 2868 additions and 1389 deletions
level_3/.data/3ZCCCW.pdf (new binary file, not shown)
level_3/poetry.lock (generated, 4068 lines; diff suppressed because it is too large)
```diff
@@ -48,6 +48,7 @@ llama-hub = "^0.0.34"
 sqlalchemy = "^2.0.21"
 asyncpg = "^0.28.0"
 dash = "^2.14.0"
+unstructured = {extras = ["pdf"], version = "^0.10.23"}
```
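The new `unstructured` dependency with the `pdf` extra supplies the PDF parsing backend used by the `.data` directory loader further down. A minimal sketch of what the extra enables, assuming the script runs from `level_3/` so the committed sample PDF is reachable at `.data/3ZCCCW.pdf`:

```python
# Requires this commit's new dependency: unstructured[pdf].
from unstructured.partition.pdf import partition_pdf

# Split the committed sample PDF into typed elements (Title, NarrativeText, ...).
elements = partition_pdf(filename=".data/3ZCCCW.pdf")
print(len(elements), "elements extracted")
```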
```diff
@@ -243,9 +243,9 @@ def data_location_route(data_string: str):
     class LocationRoute(Enum):
         """Represents classifier for the data location"""
 
-        DEVICE = "DEVICE"
-        URL = "URL"
-        DATABASE = "DATABASE"
+        DEVICE = "file_path_starting_with_.data_or_containing_it"
+        # URL = "url starting with http or https"
+        DATABASE = "database_name_like_postgres_or_mysql"
 
     return LocationRoute(data_string).name
```
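The replacement values read like label strings for a classifier, but `Enum(value)` looks a member up by its exact value. A standalone sketch of that semantics, with hypothetical inputs:

```python
from enum import Enum

class LocationRoute(Enum):
    DEVICE = "file_path_starting_with_.data_or_containing_it"
    DATABASE = "database_name_like_postgres_or_mysql"

# Lookup succeeds only when data_string equals a member value exactly:
print(LocationRoute("file_path_starting_with_.data_or_containing_it").name)  # DEVICE

# A raw path is not a member value and raises ValueError:
try:
    LocationRoute(".data/3ZCCCW.pdf")
except ValueError:
    print("raw paths must be classified into a label first")
```

So the caller is presumably expected to pass the classifier's output label into `data_location_route`, not the raw path itself.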
```diff
@@ -284,7 +284,9 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
 
     if params is None:
         data_format = data_format_route(data)  # Assume data_format_route is predefined
-        data_location = data_location_route(data)  # Assume data_location_route is predefined
+        logging.info("Data format is %s", data_format)
+        data_location = data_location_route(data)
+        logging.info("Data location is %s", data_location)  # Assume data_location_route is predefined
         test_params = generate_param_variants(
             included_params=['chunk_size'])
```
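The `%s` placeholders use logging's lazy formatting: the message is only interpolated if the record is actually emitted at the active log level. A two-line illustration:

```python
import logging

logging.basicConfig(level=logging.INFO)
# The argument is substituted by the logging module, not by an f-string,
# so the formatting cost is skipped entirely when INFO is disabled.
logging.info("Data format is %s", "PDF")
```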
```diff
@@ -392,86 +394,86 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
 
 async def main():
 
-    # metadata = {
-    #     "version": "1.0",
-    #     "agreement_id": "AG123456",
-    #     "privacy_policy": "https://example.com/privacy",
-    #     "terms_of_service": "https://example.com/terms",
-    #     "format": "json",
-    #     "schema_version": "1.1",
-    #     "checksum": "a1b2c3d4e5f6",
-    #     "owner": "John Doe",
-    #     "license": "MIT",
-    #     "validity_start": "2023-08-01",
-    #     "validity_end": "2024-07-31",
-    # }
+    metadata = {
+        "version": "1.0",
+        "agreement_id": "AG123456",
+        "privacy_policy": "https://example.com/privacy",
+        "terms_of_service": "https://example.com/terms",
+        "format": "json",
+        "schema_version": "1.1",
+        "checksum": "a1b2c3d4e5f6",
+        "owner": "John Doe",
+        "license": "MIT",
+        "validity_start": "2023-08-01",
+        "validity_end": "2024-07-31",
+    }
 
-    # test_set = [
-    #     {
-    #         "question": "Who is the main character in 'The Call of the Wild'?",
-    #         "answer": "Buck"
-    #     },
-    #     {
-    #         "question": "Who wrote 'The Call of the Wild'?",
-    #         "answer": "Jack London"
-    #     },
-    #     {
-    #         "question": "Where does Buck live at the start of the book?",
-    #         "answer": "In the Santa Clara Valley, at Judge Miller’s place."
-    #     },
-    #     {
-    #         "question": "Why is Buck kidnapped?",
-    #         "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
-    #     },
-    #     {
-    #         "question": "How does Buck become the leader of the sled dog team?",
-    #         "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
-    #     }
-    # ]
-    # # "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
-    # #http://public-library.uk/ebooks/59/83.pdf
-    # result = await start_test("http://public-library.uk/ebooks/59/83.pdf", test_set=test_set, user_id="676", params=None, metadata=metadata)
-    # #
+    test_set = [
+        {
+            "question": "Who is the main character in 'The Call of the Wild'?",
+            "answer": "Buck"
+        },
+        {
+            "question": "Who wrote 'The Call of the Wild'?",
+            "answer": "Jack London"
+        },
+        {
+            "question": "Where does Buck live at the start of the book?",
+            "answer": "In the Santa Clara Valley, at Judge Miller’s place."
+        },
+        {
+            "question": "Why is Buck kidnapped?",
+            "answer": "He is kidnapped to be sold as a sled dog in the Yukon during the Klondike Gold Rush."
+        },
+        {
+            "question": "How does Buck become the leader of the sled dog team?",
+            "answer": "Buck becomes the leader after defeating the original leader, Spitz, in a fight."
+        }
+    ]
+    # "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
+    # http://public-library.uk/ebooks/59/83.pdf
+    result = await start_test(".data/3ZCCCW.pdf", test_set=test_set, user_id="676", params=None, metadata=metadata)
+    #
 
-    parser = argparse.ArgumentParser(description="Run tests against a document.")
-    parser.add_argument("--url", required=True, help="URL of the document to test.")
-    parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
-    parser.add_argument("--user_id", required=True, help="User ID.")
-    parser.add_argument("--params", help="Additional parameters in JSON format.")
-    parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
-    parser.add_argument("--generate_test_set", required=True, help="Make a test set.")
-    parser.add_argument("--only_llm_context", required=True, help="Do a test only within the existing LLM context")
-    args = parser.parse_args()
-
-    try:
-        with open(args.test_set, "r") as file:
-            test_set = json.load(file)
-        if not isinstance(test_set, list):  # Expecting a list
-            raise TypeError("Parsed test_set JSON is not a list.")
-    except Exception as e:
-        print(f"Error loading test_set: {str(e)}")
-        return
-
-    try:
-        with open(args.metadata, "r") as file:
-            metadata = json.load(file)
-        if not isinstance(metadata, dict):
-            raise TypeError("Parsed metadata JSON is not a dictionary.")
-    except Exception as e:
-        print(f"Error loading metadata: {str(e)}")
-        return
-
-    if args.params:
-        try:
-            params = json.loads(args.params)
-            if not isinstance(params, dict):
-                raise TypeError("Parsed params JSON is not a dictionary.")
-        except json.JSONDecodeError as e:
-            print(f"Error parsing params: {str(e)}")
-            return
-    else:
-        params = None
-    #clean up params here
-    await start_test(args.url, test_set, args.user_id, params=None, metadata=metadata)
+    # parser = argparse.ArgumentParser(description="Run tests against a document.")
+    # parser.add_argument("--url", required=True, help="URL of the document to test.")
+    # parser.add_argument("--test_set", required=True, help="Path to JSON file containing the test set.")
+    # parser.add_argument("--user_id", required=True, help="User ID.")
+    # parser.add_argument("--params", help="Additional parameters in JSON format.")
+    # parser.add_argument("--metadata", required=True, help="Path to JSON file containing metadata.")
+    # parser.add_argument("--generate_test_set", required=True, help="Make a test set.")
+    # parser.add_argument("--only_llm_context", required=True, help="Do a test only within the existing LLM context")
+    # args = parser.parse_args()
+    #
+    # try:
+    #     with open(args.test_set, "r") as file:
+    #         test_set = json.load(file)
+    #     if not isinstance(test_set, list):  # Expecting a list
+    #         raise TypeError("Parsed test_set JSON is not a list.")
+    # except Exception as e:
+    #     print(f"Error loading test_set: {str(e)}")
+    #     return
+    #
+    # try:
+    #     with open(args.metadata, "r") as file:
+    #         metadata = json.load(file)
+    #     if not isinstance(metadata, dict):
+    #         raise TypeError("Parsed metadata JSON is not a dictionary.")
+    # except Exception as e:
+    #     print(f"Error loading metadata: {str(e)}")
+    #     return
+    #
+    # if args.params:
+    #     try:
+    #         params = json.loads(args.params)
+    #         if not isinstance(params, dict):
+    #             raise TypeError("Parsed params JSON is not a dictionary.")
+    #     except json.JSONDecodeError as e:
+    #         print(f"Error parsing params: {str(e)}")
+    #         return
+    # else:
+    #     params = None
+    # #clean up params here
+    # await start_test(args.url, test_set, args.user_id, params=None, metadata=metadata)
 
 if __name__ == "__main__":
 
     asyncio.run(main())
```
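For the now commented-out CLI path, `--test_set` and `--metadata` must point at JSON files that parse to a list and a dict respectively, or `main()` prints an error and returns. A minimal sketch that produces compatible input files (file names are illustrative):

```python
import json

# test_set must deserialize to a list of question/answer records.
test_set = [
    {"question": "Who wrote 'The Call of the Wild'?", "answer": "Jack London"},
]
# metadata must deserialize to a dict.
metadata = {"version": "1.0", "format": "json", "owner": "John Doe"}

with open("test_set.json", "w") as f:
    json.dump(test_set, f, indent=2)
with open("metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)
```

Note that an invocation would have to supply every `required=True` flag, including `--generate_test_set` and `--only_llm_context`, even though the code shown in this hunk never reads them.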
```diff
@@ -7,6 +7,8 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from vectordb.chunkers.chunkers import chunk_data
 from llama_hub.file.base import SimpleDirectoryReader
 
+from langchain.document_loaders import DirectoryLoader
+
 import requests
 async def _document_loader( observation: str, loader_settings: dict):
     # Check the format of the document
```
```diff
@@ -28,12 +30,20 @@ async def _document_loader( observation: str, loader_settings: dict):
         pages = chunk_data(chunk_strategy= loader_strategy, source_data=file_content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
         return pages
-    elif loader_settings.get("source") == "file":
+    elif loader_settings.get("source") == "DEVICE":
         import os
-        loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True)
-        documents = loader.load_data()
-        pages = documents.load_and_split()
-        return pages
+        current_directory = os.getcwd()
+        import logging
+        logging.info("Current Directory: %s", current_directory)
+
+        loader = DirectoryLoader(".data", recursive=True)
+
+        # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
+        documents = loader.load()
+        logging.info("Documents: %s", documents)
+        # pages = documents.load_and_split()
+        return documents
 
     elif document_format == "text":
         pages = chunk_data(chunk_strategy= loader_strategy, source_data=observation, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
```
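The DEVICE branch now delegates to LangChain's `DirectoryLoader`, which walks `.data` and by default parses each file through its `UnstructuredFileLoader`, which is presumably why the `unstructured[pdf]` extra was added to pyproject above. A standalone sketch of that branch, with paths as in the commit and illustrative logging setup:

```python
import logging

from langchain.document_loaders import DirectoryLoader

logging.basicConfig(level=logging.INFO)

# Walk .data (including subdirectories); each file is parsed by the
# default UnstructuredFileLoader into Document objects with metadata["source"].
loader = DirectoryLoader(".data", recursive=True)
documents = loader.load()
logging.info("Loaded %d documents from .data", len(documents))
```

Unlike the removed `SimpleDirectoryReader` branch, this path returns whole `Document` objects rather than chunked pages (`load_and_split` is commented out), so chunking would have to happen downstream.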