1. Dynamic metadata retrieval, refactored function 2. Load using marshmallow, which now allows dynamic fields 3. Added several varieties of chunkers 4. Fixed PDF loading so it is better standardized
39 lines
1.3 KiB
Python
39 lines
1.3 KiB
Python
import os
|
|
from io import BytesIO
|
|
|
|
import fitz
|
|
from level_2.chunkers.chunkers import chunk_data
|
|
from langchain.document_loaders import PyPDFLoader
|
|
|
|
import requests
|
|
def _document_loader( observation: str, loader_settings: dict):
|
|
# Check the format of the document
|
|
document_format = loader_settings.get("format", "text")
|
|
|
|
if document_format == "PDF":
|
|
if loader_settings.get("source") == "url":
|
|
pdf_response = requests.get(loader_settings["path"])
|
|
pdf_stream = BytesIO(pdf_response.content)
|
|
with fitz.open(stream=pdf_stream, filetype='pdf') as doc:
|
|
file_content = ""
|
|
for page in doc:
|
|
file_content += page.get_text()
|
|
pages = chunk_data(chunk_strategy= 'VANILLA', source_data=file_content)
|
|
|
|
return pages
|
|
elif loader_settings.get("source") == "file":
|
|
# Process the PDF using PyPDFLoader
|
|
# might need adapting for different loaders + OCR
|
|
# need to test the path
|
|
loader = PyPDFLoader(loader_settings["path"])
|
|
pages = loader.load_and_split()
|
|
return pages
|
|
|
|
elif document_format == "text":
|
|
# Process the text directly
|
|
return observation
|
|
|
|
else:
|
|
raise ValueError(f"Unsupported document format: {document_format}")
|
|
|
|
|