doc processing knobs

This commit is contained in:
phact 2025-09-18 16:27:01 -04:00
parent bdb477088c
commit dc91f55014
5 changed files with 86 additions and 9 deletions

View file

@ -1,6 +1,6 @@
[project] [project]
name = "openrag" name = "openrag"
version = "0.1.8" version = "0.1.9"
description = "Add your description here" description = "Add your description here"
readme = "README.md" readme = "README.md"
requires-python = ">=3.13" requires-python = ">=3.13"

View file

@ -4,12 +4,12 @@ import time
import httpx import httpx
import requests import requests
from agentd.patch import patch_openai_with_mcp from agentd.patch import patch_openai_with_mcp
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv from dotenv import load_dotenv
from openai import AsyncOpenAI from openai import AsyncOpenAI
from opensearchpy import AsyncOpenSearch from opensearchpy import AsyncOpenSearch
from opensearchpy._async.http_aiohttp import AIOHttpConnection from opensearchpy._async.http_aiohttp import AIOHttpConnection
from utils.document_processing import create_document_converter
from utils.logging_config import get_logger from utils.logging_config import get_logger
load_dotenv() load_dotenv()
@ -45,6 +45,7 @@ LANGFLOW_KEY = os.getenv("LANGFLOW_KEY")
SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-production") SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-production")
GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID")
GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET")
DOCLING_OCR_ENGINE = os.getenv("DOCLING_OCR_ENGINE")
# Ingestion configuration # Ingestion configuration
DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes") DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes")
@ -287,7 +288,7 @@ class AppClients:
self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI()) self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI())
# Initialize document converter # Initialize document converter
self.converter = DocumentConverter() self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
# Initialize Langflow HTTP client # Initialize Langflow HTTP client
self.langflow_http_client = httpx.AsyncClient( self.langflow_http_client = httpx.AsyncClient(

View file

@ -12,11 +12,79 @@ logger = get_logger(__name__)
_worker_converter = None _worker_converter = None
def create_document_converter(ocr_engine: str | None = None):
    """Create a Docling DocumentConverter with OCR disabled unless requested.

    Args:
        ocr_engine: Name of the Docling OCR engine to enable; it is passed as
            ``kind`` to the Docling OCR factory. When ``None``, the
            ``DOCLING_OCR_ENGINE`` environment variable is consulted instead.
            An empty or whitespace-only value leaves OCR switched off.

    Returns:
        A ``DocumentConverter`` instance. If the option classes cannot be
        imported, or the configured converter cannot be constructed, a
        converter built with Docling's own defaults is returned instead.
    """
    if ocr_engine is None:
        ocr_engine = os.getenv("DOCLING_OCR_ENGINE")
    # Values coming from the environment frequently carry stray whitespace;
    # normalise so " easyocr " selects the engine and "   " means "unset".
    ocr_engine = ocr_engine.strip() if ocr_engine else None

    try:
        from docling.document_converter import (
            DocumentConverter,
            InputFormat,
            PdfFormatOption,
        )
        from docling.datamodel.pipeline_options import PdfPipelineOptions
    except Exception as exc:  # pragma: no cover - fallback path
        logger.debug(
            "Falling back to default DocumentConverter import",
            error=str(exc),
        )
        from docling.document_converter import DocumentConverter  # type: ignore

        return DocumentConverter()

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False

    if ocr_engine:
        try:
            from docling.models.factories import get_ocr_factory

            factory = get_ocr_factory(allow_external_plugins=False)
            # Build the engine options first so OCR is only switched on once
            # the requested engine is known to be available.
            pipeline_options.ocr_options = factory.create_options(kind=ocr_engine)
            pipeline_options.do_ocr = True
        except Exception as exc:  # pragma: no cover - optional path
            pipeline_options.do_ocr = False
            logger.warning(
                "Unable to enable requested Docling OCR engine, using OCR-off",
                ocr_engine=ocr_engine,
                error=str(exc),
            )

    # Apply the same pipeline options to every input format this Docling
    # version exposes; members missing from InputFormat are skipped.
    format_options = {}
    for format_name in ("PDF", "IMAGE"):
        input_format = getattr(InputFormat, format_name, None)
        if input_format is not None:
            format_options[input_format] = PdfFormatOption(
                pipeline_options=pipeline_options
            )

    options_applied = True
    try:
        converter = DocumentConverter(format_options=format_options or None)
    except Exception as exc:  # pragma: no cover - fallback path
        logger.warning(
            "Docling converter initialization failed, falling back to defaults",
            error=str(exc),
        )
        converter = DocumentConverter()
        # The fallback converter ignores pipeline_options entirely, so the
        # OCR settings computed above do not describe it.
        options_applied = False

    ocr_enabled = pipeline_options.do_ocr if options_applied else None
    logger.info(
        "Docling converter initialized",
        ocr_engine=ocr_engine if ocr_enabled else None,
        # None means "Docling library default" (custom options not applied).
        ocr_enabled=ocr_enabled,
        custom_options_applied=options_applied,
    )

    return converter
def get_worker_converter(): def get_worker_converter():
"""Get or create a DocumentConverter instance for this worker process""" """Get or create a DocumentConverter instance for this worker process"""
global _worker_converter global _worker_converter
if _worker_converter is None: if _worker_converter is None:
from docling.document_converter import DocumentConverter
# Configure GPU settings for this worker # Configure GPU settings for this worker
has_gpu_devices, _ = detect_gpu_devices() has_gpu_devices, _ = detect_gpu_devices()
@ -45,7 +113,7 @@ def get_worker_converter():
logger.info( logger.info(
"Initializing DocumentConverter in worker process", worker_pid=os.getpid() "Initializing DocumentConverter in worker process", worker_pid=os.getpid()
) )
_worker_converter = DocumentConverter() _worker_converter = create_document_converter()
logger.info("DocumentConverter ready in worker process", worker_pid=os.getpid()) logger.info("DocumentConverter ready in worker process", worker_pid=os.getpid())
return _worker_converter return _worker_converter

2
uv.lock generated
View file

@ -2282,7 +2282,7 @@ wheels = [
[[package]] [[package]]
name = "openrag" name = "openrag"
version = "0.1.8" version = "0.1.9"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "agentd" }, { name = "agentd" },

View file

@ -1,6 +1,13 @@
import logging import logging
import os
import sys
from docling.document_converter import DocumentConverter repo_root = os.path.dirname(__file__)
src_path = os.path.join(repo_root, "src")
if src_path not in sys.path:
sys.path.insert(0, src_path)
from utils.document_processing import create_document_converter
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -11,7 +18,8 @@ try:
# Use the sample document to warm up docling # Use the sample document to warm up docling
test_file = "/app/warmup_ocr.pdf" test_file = "/app/warmup_ocr.pdf"
logger.info(f"Using test file to warm up docling: {test_file}") logger.info(f"Using test file to warm up docling: {test_file}")
DocumentConverter().convert(test_file) converter = create_document_converter()
converter.convert(test_file)
logger.info("Docling models warmed up successfully") logger.info("Docling models warmed up successfully")
except Exception as e: except Exception as e:
logger.info(f"Docling warm-up completed with exception: {str(e)}") logger.info(f"Docling warm-up completed with exception: {str(e)}")