1. Dynamic metadata retrieval; refactored the function. 2. Loading now uses marshmallow, which allows dynamic fields. 3. Added several varieties of chunkers. 4. Fixed PDF loading so that it is better standardized.
41 lines
1.4 KiB
Python
41 lines
1.4 KiB
Python
import os
|
|
from io import BytesIO
|
|
import sys, os
|
|
import fitz
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
from chunkers.chunkers import chunk_data
|
|
from langchain.document_loaders import PyPDFLoader
|
|
|
|
import requests
|
|
def _document_loader( observation: str, loader_settings: dict):
|
|
# Check the format of the document
|
|
document_format = loader_settings.get("format", "text")
|
|
|
|
if document_format == "PDF":
|
|
if loader_settings.get("source") == "url":
|
|
pdf_response = requests.get(loader_settings["path"])
|
|
pdf_stream = BytesIO(pdf_response.content)
|
|
with fitz.open(stream=pdf_stream, filetype='pdf') as doc:
|
|
file_content = ""
|
|
for page in doc:
|
|
file_content += page.get_text()
|
|
pages = chunk_data(chunk_strategy= 'VANILLA', source_data=file_content)
|
|
|
|
return pages
|
|
elif loader_settings.get("source") == "file":
|
|
# Process the PDF using PyPDFLoader
|
|
# might need adapting for different loaders + OCR
|
|
# need to test the path
|
|
loader = PyPDFLoader(loader_settings["path"])
|
|
pages = loader.load_and_split()
|
|
return pages
|
|
|
|
elif document_format == "text":
|
|
# Process the text directly
|
|
return observation
|
|
|
|
else:
|
|
raise ValueError(f"Unsupported document format: {document_format}")
|
|
|
|
|