Merge pull request #27 from topoteretes/add_async_elements

Add async elements
Vasilije 2023-10-16 16:25:55 +02:00 committed by GitHub
commit 7dcecf985b
5 changed files with 2793 additions and 1313 deletions

level_3/.data/3ZCCCW.pdf (new binary file, not shown)

level_3/poetry.lock (generated, 4068 lines changed; diff suppressed because it is too large)

level_3/pyproject.toml

@@ -48,6 +48,7 @@ llama-hub = "^0.0.34"
 sqlalchemy = "^2.0.21"
 asyncpg = "^0.28.0"
 dash = "^2.14.0"
+unstructured = {extras = ["pdf"], version = "^0.10.23"}
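
The unstructured extra with PDF support appears to be needed because langchain's DirectoryLoader (used in the loader changes below) falls back to unstructured when parsing files such as PDFs. A minimal sketch of calling the library directly, assuming the PDF bundled with this PR sits under .data:

    from unstructured.partition.pdf import partition_pdf

    # Parse a local PDF into typed elements (Title, NarrativeText, ...).
    elements = partition_pdf(filename=".data/3ZCCCW.pdf")
    print([el.category for el in elements][:10])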

View file

@@ -1,4 +1,5 @@
import argparse
import json
import random
import itertools
@@ -242,9 +243,9 @@ def data_location_route(data_string: str):
     class LocationRoute(Enum):
         """Represents classifier for the data location"""
-        DEVICE = "DEVICE"
-        URL = "URL"
-        DATABASE = "DATABASE"
+        DEVICE = "file_path_starting_with_.data_or_containing_it"
+        # URL = "url starting with http or https"
+        DATABASE = "database_name_like_postgres_or_mysql"
     return LocationRoute(data_string).name
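
The enum members now carry descriptive value strings (presumably so an upstream classifier can emit them verbatim), while .name still resolves to the short route label. A minimal sketch of that value-to-name lookup, assuming the classifier output matches one of the values:

    from enum import Enum

    class LocationRoute(Enum):
        DEVICE = "file_path_starting_with_.data_or_containing_it"
        DATABASE = "database_name_like_postgres_or_mysql"

    # Enum lookup by value maps the descriptive string back to the route name.
    print(LocationRoute("database_name_like_postgres_or_mysql").name)  # -> DATABASE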
@@ -283,7 +284,9 @@ async def start_test(data, test_set=None, user_id=None, params=None, job_id=None
     if params is None:
         data_format = data_format_route(data)  # Assume data_format_route is predefined
-        data_location = data_location_route(data)  # Assume data_location_route is predefined
+        logging.info("Data format is %s", data_format)
+        data_location = data_location_route(data)
+        logging.info("Data location is %s",data_location)# Assume data_location_route is predefined
         test_params = generate_param_variants(
             included_params=['chunk_size'])
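
generate_param_variants itself is not part of this diff; purely as a hypothetical illustration (parameter names and default values are assumed, not the repo's implementation), a variant generator restricted to chunk_size could look like:

    import itertools

    # Hypothetical sketch: one dict per combination of the included parameters.
    def generate_param_variants(included_params=None):
        space = {"chunk_size": [256, 512, 1024], "chunk_overlap": [10, 20]}
        keys = [k for k in space if included_params is None or k in included_params]
        return [dict(zip(keys, combo)) for combo in itertools.product(*(space[k] for k in keys))]

    print(generate_param_variants(included_params=['chunk_size']))  # [{'chunk_size': 256}, ...]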
@@ -429,7 +432,7 @@ async def main():
     ]
     # "https://www.ibiblio.org/ebooks/London/Call%20of%20Wild.pdf"
     #http://public-library.uk/ebooks/59/83.pdf
-    result = await start_test("http://public-library.uk/ebooks/59/83.pdf", test_set=test_set, user_id="676", params=None, metadata=metadata)
+    result = await start_test(".data/3ZCCCW.pdf", test_set=test_set, user_id="676", params=None, metadata=metadata)
     #
     # parser = argparse.ArgumentParser(description="Run tests against a document.")
     # parser.add_argument("--url", required=True, help="URL of the document to test.")
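
The test run now points at the local .data/3ZCCCW.pdf added in this PR instead of a remote URL. A minimal driver sketch, assuming it runs in the module where start_test is defined and that the keyword arguments other than the path are placeholders:

    import asyncio

    # Run the async test entry point against the bundled PDF.
    asyncio.run(start_test(".data/3ZCCCW.pdf", test_set=None, user_id="676", params=None, metadata=None))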
@@ -440,7 +443,7 @@ async def main():
# parser.add_argument("--generate_test_set", required=True, help="Make a test set.")
# parser.add_argument("--only_llm_context", required=True, help="Do a test only within the existing LLM context")
# args = parser.parse_args()
#
# try:
# with open(args.test_set, "r") as file:
# test_set = json.load(file)

View file

@@ -7,6 +7,8 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from vectordb.chunkers.chunkers import chunk_data
 from llama_hub.file.base import SimpleDirectoryReader
+from langchain.document_loaders import DirectoryLoader
+import requests
 async def _document_loader( observation: str, loader_settings: dict):
     # Check the format of the document
@@ -28,12 +30,20 @@ async def _document_loader( observation: str, loader_settings: dict):
             pages = chunk_data(chunk_strategy= loader_strategy, source_data=file_content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             return pages
-        elif loader_settings.get("source") == "file":
+        elif loader_settings.get("source") == "DEVICE":
             import os
-            loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True)
-            documents = loader.load_data()
-            pages = documents.load_and_split()
-            return pages
+            current_directory = os.getcwd()
+            import logging
+            logging.info("Current Directory: %s", current_directory)
+            loader = DirectoryLoader(".data", recursive=True)
+            # loader = SimpleDirectoryReader(".data", recursive=True, exclude_hidden=True)
+            documents = loader.load()
+            logging.info("Documents: %s", documents)
+            # pages = documents.load_and_split()
+            return documents
     elif document_format == "text":
         pages = chunk_data(chunk_strategy= loader_strategy, source_data=observation, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
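
The DEVICE branch now loads everything under .data with langchain's DirectoryLoader and returns whole documents instead of split pages. A self-contained sketch of that loading path, assuming a .data folder exists in the working directory and logging is configured:

    import logging
    from langchain.document_loaders import DirectoryLoader

    logging.basicConfig(level=logging.INFO)

    # Recursively load every file under .data; PDFs are parsed via unstructured under the hood.
    loader = DirectoryLoader(".data", recursive=True)
    documents = loader.load()
    logging.info("Loaded %d documents from %s", len(documents), ".data")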