added fix
This commit is contained in:
parent
6a1f44f564
commit
6c6476f729
6 changed files with 583 additions and 0 deletions
403
level_1/level_1_pdf_vectorstore_dlt_etl.py
Normal file
403
level_1/level_1_pdf_vectorstore_dlt_etl.py
Normal file
|
|
@ -0,0 +1,403 @@
|
||||||
|
#Make sure to install the following packages: dlt, langchain, duckdb, python-dotenv, openai, weaviate-client
|
||||||
|
|
||||||
|
import dlt
|
||||||
|
from langchain import PromptTemplate, LLMChain
|
||||||
|
from langchain.chains.openai_functions import create_structured_output_chain
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.document_loaders import PyPDFLoader
|
||||||
|
import weaviate
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
|
||||||
|
from langchain.retrievers import WeaviateHybridSearchRetriever
|
||||||
|
from langchain.schema import Document, SystemMessage, HumanMessage
|
||||||
|
from langchain.vectorstores import Weaviate
|
||||||
|
import uuid
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
from pathlib import Path
|
||||||
|
from langchain import OpenAI, LLMMathChain
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
embeddings = OpenAIEmbeddings()
|
||||||
|
|
||||||
|
from deep_translator import (GoogleTranslator)
|
||||||
|
|
||||||
|
def _convert_pdf_to_document(path: str = None):
    """Convert a PDF file into a list of translated Document objects.

    Args:
        path: Filesystem path to the PDF. The path must contain a
            "personal_receipts" directory component whose sub-folders
            encode the metadata (year first, then country).

    Returns:
        list[Document]: One Document per PDF page, translated to English,
        carrying the folder-derived metadata.

    Raises:
        ValueError: If ``path`` is None, does not contain a
            "personal_receipts" folder, or lacks year/country sub-folders.
    """
    if path is None:
        raise ValueError("A valid path to the document must be provided.")

    loader = PyPDFLoader(path)
    pages = loader.load_and_split()

    # Parse metadata from the folder path, e.g.
    # .../personal_receipts/<year>/<country>/... -> ("2017", "de", ...)
    path_parts = Path(path).parts
    try:
        personal_receipts_index = path_parts.index("personal_receipts")
    except ValueError:
        # Surface a clear message instead of the bare "x not in tuple".
        raise ValueError(
            "Expected the document path to contain a 'personal_receipts' folder."
        )
    metadata_parts = path_parts[personal_receipts_index + 1:]
    if len(metadata_parts) < 2:
        # Guard the metadata_parts[0]/[1] lookups below.
        raise ValueError(
            "Expected at least year/country folders under 'personal_receipts'."
        )

    documents = []
    for page in pages:
        # Receipts may be in any language; normalize content to English.
        translation = GoogleTranslator(source='auto', target='en').translate(text=page.page_content)
        documents.append(
            Document(
                metadata={
                    "title": "Personal Receipt",
                    "country": metadata_parts[1],
                    "year": metadata_parts[0],
                    "author": str(uuid.uuid4()),
                    "source": "/".join(metadata_parts),
                },
                page_content=translation,
            )
        )
    return documents
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _init_weaviate():
    """Initialize weaviate client and retriever.

    Reads WEAVIATE_API_KEY and OPENAI_API_KEY from the environment.
    The cluster URL may be overridden via WEAVIATE_URL; it defaults to
    the original hard-coded cluster for backward compatibility.

    Returns:
        WeaviateHybridSearchRetriever over the "PDFloader" index
        (schema is created on first use if missing).
    """
    auth_config = weaviate.auth.AuthApiKey(api_key=os.environ.get('WEAVIATE_API_KEY'))
    client = weaviate.Client(
        # Allow overriding the cluster URL without a code change.
        url=os.environ.get('WEAVIATE_URL', 'https://my-vev-index-o4qitptw.weaviate.network'),
        auth_client_secret=auth_config,
        additional_headers={
            "X-OpenAI-Api-Key": os.environ.get('OPENAI_API_KEY')
        }
    )
    retriever = WeaviateHybridSearchRetriever(
        client=client,
        index_name="PDFloader",
        text_key="text",
        attributes=[],
        embedding=embeddings,
        create_schema_if_missing=True,
    )
    return retriever
|
||||||
|
def load_to_weaviate(document_path=None):
    """Convert the PDF at *document_path* and index the pages in Weaviate."""
    hybrid_retriever = _init_weaviate()
    converted_docs = _convert_pdf_to_document(document_path)
    return hybrid_retriever.add_documents(converted_docs)
|
||||||
|
|
||||||
|
|
||||||
|
def get_from_weaviate(query=None, path=None, operator=None, valueText=None):
    """
    Get documents from weaviate.

    Args:
        query (str): The query string.
        path (list): The path for filtering, e.g., ['year'].
        operator (str): The operator for filtering, e.g., 'Equal'.
        valueText (str): The value for filtering, e.g., '2017*'.

    Returns:
        The relevant documents (filtered when any filter argument is given).

    Example:
        get_from_weaviate(query="some query", path=['year'], operator='Equal', valueText='2017*')
    """
    retriever = _init_weaviate()

    if path or operator or valueText:
        # Build the where_filter from whichever parameters were provided.
        # BUG FIX: the original also ran an unconditional, unfiltered
        # retrieval first whose result was immediately discarded.
        where_filter = {
            'path': path if path else [],
            'operator': operator if operator else '',
            'valueText': valueText if valueText else ''
        }
        return retriever.get_relevant_documents(
            query,
            score=True,
            where_filter=where_filter,
        )

    # No filters: plain hybrid retrieval.
    return retriever.get_relevant_documents(
        query,
        score=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def delete_from_weaviate(query=None, filters=None):
    """Delete documents from weaviate, pass dict as filters.

    Args:
        query: Unused; kept for interface compatibility.
        filters (dict): GraphQL-style where filter, e.g.
            {'path': ['year'], 'operator': 'Equal', 'valueText': '2017*'}.
            Falls back to that 2017 example (the original hard-coded
            behavior) when omitted.

    Returns:
        str: "Success" once the batch delete has been issued.
    """
    auth_config = weaviate.auth.AuthApiKey(api_key=os.environ.get('WEAVIATE_API_KEY'))
    client = weaviate.Client(
        url='https://my-vev-index-o4qitptw.weaviate.network',
        auth_client_secret=auth_config,
        additional_headers={
            "X-OpenAI-Api-Key": os.environ.get('OPENAI_API_KEY')
        }
    )
    # BUG FIX: the original ignored `filters` and always deleted the
    # hard-coded 2017 set; use the caller-supplied filter when given.
    where_filter = filters or {
        'path': ['year'],
        'operator': 'Equal',
        'valueText': '2017*'
    }
    client.batch.delete_objects(
        class_name='PDFloader',
        # Same `where` filter as in the GraphQL API
        where=where_filter,
    )
    return "Success"
|
||||||
|
|
||||||
|
|
||||||
|
# Shared chat model used by every chain in this module.
# temperature=0.0 for deterministic extraction; "gpt-4-0613" supports
# OpenAI function calling, required by create_structured_output_chain.
llm = ChatOpenAI(
    temperature=0.0,
    max_tokens=1200,
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
    model_name="gpt-4-0613",
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def infer_schema_from_text(text: str):
    """Infer a JSON schema describing *text* via the LLM.

    Args:
        text: Unstructured source text (e.g. a receipt page).

    Returns:
        The schema parsed into a Python object, matching what
        ``load_json_or_infer_schema`` returns on its file-loading path.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    prompt_ = """ You are a json schema master. Create a JSON schema based on the following data and don't write anything else: {prompt} """

    complete_query = PromptTemplate(
        input_variables=["prompt"],
        template=prompt_,
    )

    chain = LLMChain(
        llm=llm, prompt=complete_query, verbose=True
    )
    chain_result = chain.run(prompt=text).strip()

    # BUG FIX: the original called json.dumps() on the model's string
    # reply, which merely re-quotes/escapes it instead of producing a
    # schema object. Parse it so the return type is consistent with the
    # schema-file code path in load_json_or_infer_schema.
    return json.loads(chain_result)
|
||||||
|
|
||||||
|
|
||||||
|
def set_data_contract(data, version, date, agreement_id=None, privacy_policy=None, terms_of_service=None, format=None, schema_version=None, checksum=None, owner=None, license=None, validity_start=None, validity_end=None):
    """Assemble a generic data-contract dict around *data*.

    Every optional field collapses to "" when falsy; *data* itself is
    attached unchanged under the "properties" key.
    """
    optional_fields = {
        "version": version,
        "date": date,
        "agreement_id": agreement_id,
        "privacy_policy": privacy_policy,
        "terms_of_service": terms_of_service,
        "format": format,
        "schema_version": schema_version,
        "checksum": checksum,
        "owner": owner,
        "license": license,
        "validity_start": validity_start,
        "validity_end": validity_end,
    }
    # Falsy values become "" exactly as the original `x or ""` chain did.
    data_contract = {key: (value or "") for key, value in optional_fields.items()}
    data_contract["properties"] = data  # the given data rides along unchanged
    return data_contract
|
||||||
|
|
||||||
|
def create_id_dict(memory_id=None, st_memory_id=None, buffer_id=None):
    """
    Create a dictionary containing IDs for memory, st_memory, and buffer.

    Args:
        memory_id (str): The Memory ID.
        st_memory_id (str): The St_memory ID.
        buffer_id (str): The Buffer ID.

    Returns:
        dict: A dictionary containing the IDs; falsy values become "".
    """
    return {
        "memoryID": memory_id if memory_id else "",
        "st_MemoryID": st_memory_id if st_memory_id else "",
        "bufferID": buffer_id if buffer_id else "",
    }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def init_buffer(data, version, date, memory_id=None, st_memory_id=None, buffer_id=None, agreement_id=None, privacy_policy=None, terms_of_service=None, format=None, schema_version=None, checksum=None, owner=None, license=None, validity_start=None, validity_end=None, text=None, process=None):
    """Build a data contract for *data* enriched with relation IDs.

    Optionally infers a schema from *text* and records *process*, both
    under the contract's "properties". Note: *data* is stored by
    reference, so the mapping passed in is mutated by these additions.
    """
    relations = create_id_dict(memory_id, st_memory_id, buffer_id)

    contract = set_data_contract(
        data, version, date, agreement_id, privacy_policy, terms_of_service,
        format, schema_version, checksum, owner, license,
        validity_start, validity_end,
    )

    properties = contract["properties"]
    properties["relations"] = relations

    if text:
        # Derive a schema for the raw text and keep it alongside the data.
        properties["schema"] = infer_schema_from_text(text)

    if process:
        properties["process"] = process

    return contract
|
||||||
|
|
||||||
|
|
||||||
|
def infer_properties_from_text(text: str):
    """Ask the LLM for a short JSON index of the key facts in *text*.

    Returns the raw (stripped) model reply; unlike infer_schema_from_text,
    the result is not parsed as JSON.
    """
    index_template = """ You are a json index master. Create a short JSON index containing the most important data and don't write anything else: {prompt} """

    index_prompt = PromptTemplate(
        input_variables=["prompt"],
        template=index_template,
    )
    indexing_chain = LLMChain(llm=llm, prompt=index_prompt, verbose=True)
    return indexing_chain.run(prompt=text).strip()
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# # print(infer_schema_from_text(output[0].page_content))
|
||||||
|
|
||||||
|
|
||||||
|
def load_json_or_infer_schema(file_path, document_path):
    """Load JSON schema from file or infer schema from text.

    Tries *file_path* first; when no such file exists, falls back to
    converting the PDF at *document_path* and inferring a schema from
    its first page.
    """
    try:
        with open(file_path, 'r') as schema_file:
            return json.load(schema_file)
    except FileNotFoundError:
        # No schema on disk yet — derive one from the document itself.
        pages = _convert_pdf_to_document(path=document_path)
        return infer_schema_from_text(pages[0].page_content)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def ai_function(prompt=None, json_schema=None):
    """AI function to convert unstructured data to structured data.

    Yields a single structured-output result so the generator can be
    consumed directly as a dlt pipeline data source.
    """
    # System/human messages frame the task; {input} carries the raw text.
    message_stack = [
        SystemMessage(
            content="You are a world class algorithm converting unstructured data into structured data."
        ),
        HumanMessage(content="Convert unstructured data to structured data:"),
        HumanMessagePromptTemplate.from_template("{input}"),
        HumanMessage(content="Tips: Make sure to answer in the correct format"),
    ]
    structured_prompt = ChatPromptTemplate(messages=message_stack)
    extraction_chain = create_structured_output_chain(
        json_schema, prompt=structured_prompt, llm=llm, verbose=True
    )
    yield extraction_chain.run(input=prompt, llm=llm)
|
||||||
|
|
||||||
|
|
||||||
|
# Define a base directory if you have one; this could be the directory where your script is located
# (used below to resolve the bundled personal_receipts PDFs and ticket_schema.json;
# note that main() re-binds its own BASE_DIR from the current working directory).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def higher_level_thinking():
    """Higher level thinking function to calculate the sum of the price of the tickets from these documents"""
    matching_docs = get_from_weaviate(
        query="Train", path=['year'], operator='Equal', valueText='2017*'
    )
    str_docs_data = str(matching_docs)

    math_chain = LLMMathChain.from_llm(llm, verbose=True)
    total = math_chain.run(
        f"Calculate the sum of the price of the tickets from these documents: {str_docs_data}"
    )
    # Yield (rather than return) so dlt can consume this as a data source.
    yield total
|
||||||
|
# NOTE(review): this creates a generator object at import time but never
# iterates it, so the chain never actually runs here; the name also isn't
# referenced anywhere below — looks like leftover scaffolding, confirm.
result_higher_level_thinking = higher_level_thinking()
||||||
|
def process_higher_level_thinking(result=None):
    """Wrap *result* in a data contract and yield it as a JSON string.

    Args:
        result: The value to record under the contract's "properties".

    Yields:
        str: The JSON-serialized data contract (yielded so the output can
        be streamed into a dlt pipeline).
    """
    data_format = init_buffer(data=result, version="0.0.1", date="2021-09-01")
    # json is already imported at module level; the original redundantly
    # re-imported it inside this function.
    yield json.dumps(data_format)
|
||||||
|
|
||||||
|
# Default demo inputs: two 2017 German public-transport receipts bundled
# next to this script. (The CLI below accepts positional document paths;
# this module-level list is the bundled example set.)
document_paths = [
    os.path.join(BASE_DIR, "personal_receipts", "2017", "de", "public_transport", "3ZCCCW.pdf"),
    os.path.join(BASE_DIR, "personal_receipts", "2017", "de", "public_transport", "4GBEC9.pdf")
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main(raw_loading, processed_loading, document_paths):
    """Entry point: run the raw-document ETL or the processed pipeline.

    Args:
        raw_loading (bool): Convert/index the PDFs and load structured rows
            into duckdb via dlt.
        processed_loading (bool): Run the higher-level ticket-summing
            pipeline instead.
        document_paths (list[str]): Slash-separated document paths, resolved
            relative to the current working directory.
    """
    BASE_DIR = os.getcwd()  # Assuming the current working directory is where the data_processing_script.py is located

    def _resolve_path(base_dir, path):
        """Turn a "/a/b/c.pdf"-style path into an absolute OS path."""
        elements = path.strip("/").split("/")
        return os.path.join(base_dir, *elements)

    # BUG FIX: the original built a list-of-lists and then kept only
    # element [0], silently dropping every document after the first.
    document_paths_ = [_resolve_path(BASE_DIR, path) for path in document_paths]
    print(document_paths_)

    if raw_loading:
        for document in document_paths_:
            file_path = os.path.join(BASE_DIR, "ticket_schema.json")
            json_schema = load_json_or_infer_schema(file_path, document)
            output = _convert_pdf_to_document(path=document)
            find_data_in_store = get_from_weaviate(query="Train", path=['year'], operator='Equal', valueText='2017*')

            if find_data_in_store:
                output = find_data_in_store
                # BUG FIX: output[1] raised IndexError when the store
                # returned a single document.
                print(output[1] if len(output) > 1 else output[0])
            else:
                load_to_weaviate(document)

            pipeline = dlt.pipeline(pipeline_name="train_ticket", destination='duckdb', dataset_name='train_ticket_data')
            info = pipeline.run(data=ai_function(output[0].page_content, json_schema))
            print(info)

    elif processed_loading:
        pipeline_processed = dlt.pipeline(pipeline_name="train_ticket_processed", destination='duckdb',
                                          dataset_name='train_ticket_processed_data')
        info = pipeline_processed.run(data=higher_level_thinking())
        print(info)

    else:
        print("Please specify either '--raw_loading' or '--processed_loading' option.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI: pick exactly one of --raw_loading / --processed_loading and
    # optionally pass document paths as positional arguments.
    parser = argparse.ArgumentParser(description="Data Processing Script")
    parser.add_argument("--raw_loading", action="store_true", help="Load raw document data and perform AI tasks")
    parser.add_argument("--processed_loading", action="store_true",
                        help="Load processed data and run higher-level thinking AI function")
    parser.add_argument("document_paths", nargs="*", help="Paths to the documents to process")

    args = parser.parse_args()

    main(args.raw_loading, args.processed_loading, args.document_paths)


#TO RUN: python3 level_1_pdf_vectorstore_dlt_etl.py --raw_loading "/personal_receipts/2017/de/public_transport/3ZCCCW.pdf"
|
||||||
|
|
||||||
BIN
level_1/personal_receipts/2017/de/public_transport/118NP8.pdf
Normal file
BIN
level_1/personal_receipts/2017/de/public_transport/118NP8.pdf
Normal file
Binary file not shown.
BIN
level_1/personal_receipts/2017/de/public_transport/3ZCCCW.pdf
Normal file
BIN
level_1/personal_receipts/2017/de/public_transport/3ZCCCW.pdf
Normal file
Binary file not shown.
BIN
level_1/personal_receipts/2017/de/public_transport/4GBEC9.pdf
Normal file
BIN
level_1/personal_receipts/2017/de/public_transport/4GBEC9.pdf
Normal file
Binary file not shown.
BIN
level_1/personal_receipts/2017/de/public_transport/96W2GF.pdf
Normal file
BIN
level_1/personal_receipts/2017/de/public_transport/96W2GF.pdf
Normal file
Binary file not shown.
180
level_1/ticket_schema.json
Normal file
180
level_1/ticket_schema.json
Normal file
|
|
@ -0,0 +1,180 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"ticketType": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["online ticket", "ICE ticket"]
|
||||||
|
},
|
||||||
|
"departureDate": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"priceType": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["Flex price (single journey)"]
|
||||||
|
},
|
||||||
|
"class": {
|
||||||
|
"type": "integer",
|
||||||
|
"enum": [1]
|
||||||
|
},
|
||||||
|
"adult": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"quantity": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"BC50": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["quantity", "BC50"]
|
||||||
|
},
|
||||||
|
"journey": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"from": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"to": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"via": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"train": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["ICE"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["from", "to", "via", "train"]
|
||||||
|
},
|
||||||
|
"refundPolicy": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"payment": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"quantity": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"price": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"vat19": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"vat7": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["name", "quantity", "price", "vat19", "vat7"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"method": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["credit card"]
|
||||||
|
},
|
||||||
|
"transactionDetails": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"amount": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"VUNumber": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"transactionNumber": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"date": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"genNumber": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["amount", "VUNumber", "transactionNumber", "date", "genNumber"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["items", "total", "method", "transactionDetails"]
|
||||||
|
},
|
||||||
|
"bookingDetails": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"bookingDate": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time"
|
||||||
|
},
|
||||||
|
"bookingAddress": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"taxNumber": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["bookingDate", "bookingAddress", "taxNumber"]
|
||||||
|
},
|
||||||
|
"journeyDetails": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"validFrom": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"passengerName": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"orderNumber": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"stops": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"stop": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"date": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date"
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "time"
|
||||||
|
},
|
||||||
|
"track": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"product": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"reservation": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["stop", "date", "time", "track", "product", "reservation"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["validFrom", "passengerName", "orderNumber", "stops"]
|
||||||
|
},
|
||||||
|
"usageNotes": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["ticketType", "departureDate", "priceType", "class", "adult", "journey", "refundPolicy", "payment", "bookingDetails", "journeyDetails", "usageNotes"]
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue