39 lines
1.3 KiB
Python
39 lines
1.3 KiB
Python
import os
|
|
from io import BytesIO
|
|
|
|
import fitz
|
|
from level_2.chunkers.chunkers import chunk_data
|
|
from langchain.document_loaders import PyPDFLoader
|
|
|
|
import requests
|
|
def _document_loader( observation: str, loader_settings: dict):
|
|
# Check the format of the document
|
|
document_format = loader_settings.get("format", "text")
|
|
|
|
if document_format == "PDF":
|
|
if loader_settings.get("source") == "url":
|
|
pdf_response = requests.get(loader_settings["path"])
|
|
pdf_stream = BytesIO(pdf_response.content)
|
|
with fitz.open(stream=pdf_stream, filetype='pdf') as doc:
|
|
file_content = ""
|
|
for page in doc:
|
|
file_content += page.get_text()
|
|
pages = chunk_data(chunk_strategy= 'VANILLA', source_data=file_content)
|
|
|
|
return pages
|
|
elif loader_settings.get("source") == "file":
|
|
# Process the PDF using PyPDFLoader
|
|
# might need adapting for different loaders + OCR
|
|
# need to test the path
|
|
loader = PyPDFLoader(loader_settings["path"])
|
|
pages = loader.load_and_split()
|
|
return pages
|
|
|
|
elif document_format == "text":
|
|
# Process the text directly
|
|
return observation
|
|
|
|
else:
|
|
raise ValueError(f"Unsupported document format: {document_format}")
|
|
|
|
|