doc processing knobs

This commit is contained in:
phact 2025-09-18 16:27:01 -04:00
parent bdb477088c
commit dc91f55014
5 changed files with 86 additions and 9 deletions

View file

@ -1,6 +1,6 @@
[project]
name = "openrag"
version = "0.1.8"
version = "0.1.9"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"

View file

@ -4,12 +4,12 @@ import time
import httpx
import requests
from agentd.patch import patch_openai_with_mcp
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv
from openai import AsyncOpenAI
from opensearchpy import AsyncOpenSearch
from opensearchpy._async.http_aiohttp import AIOHttpConnection
from utils.document_processing import create_document_converter
from utils.logging_config import get_logger
load_dotenv()
@ -45,6 +45,7 @@ LANGFLOW_KEY = os.getenv("LANGFLOW_KEY")
SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-production")
GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID")
GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET")
DOCLING_OCR_ENGINE = os.getenv("DOCLING_OCR_ENGINE")
# Ingestion configuration
DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes")
@ -287,7 +288,7 @@ class AppClients:
self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI())
# Initialize document converter
self.converter = DocumentConverter()
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
# Initialize Langflow HTTP client
self.langflow_http_client = httpx.AsyncClient(

View file

@ -12,12 +12,80 @@ logger = get_logger(__name__)
_worker_converter = None
def create_document_converter(ocr_engine: str | None = None):
    """Create a Docling DocumentConverter with OCR disabled unless requested.

    Args:
        ocr_engine: Name of the Docling OCR engine to enable (passed as
            ``kind`` to Docling's OCR factory). When ``None``, the
            ``DOCLING_OCR_ENGINE`` environment variable is consulted. A
            missing or empty value leaves OCR turned off.

    Returns:
        A ``DocumentConverter`` instance. On the normal path its PDF and
        IMAGE formats are configured with ``do_ocr=False``, or with the
        requested OCR engine when that engine's options can be created.

        Two best-effort fallbacks return a converter built with
        ``DocumentConverter()`` and no custom options: when the option
        classes cannot be imported, and when constructing the configured
        converter raises. In both cases the OCR settings chosen here are
        NOT applied and Docling's own defaults are in effect.
        NOTE(review): Docling's default pipeline is believed to run OCR --
        confirm against the pinned Docling version.
    """
    if ocr_engine is None:
        ocr_engine = os.getenv("DOCLING_OCR_ENGINE")

    try:
        from docling.document_converter import (
            DocumentConverter,
            InputFormat,
            PdfFormatOption,
        )
        from docling.datamodel.pipeline_options import PdfPipelineOptions
    except Exception as exc:  # pragma: no cover - fallback path
        # Deliberately broad: any import-time failure of the option classes
        # degrades to a plain converter instead of breaking startup.
        logger.debug(
            "Falling back to default DocumentConverter import",
            error=str(exc),
        )
        from docling.document_converter import DocumentConverter  # type: ignore

        return DocumentConverter()

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False

    if ocr_engine:
        try:
            from docling.models.factories import get_ocr_factory

            factory = get_ocr_factory(allow_external_plugins=False)
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = factory.create_options(kind=ocr_engine)
        except Exception as exc:  # pragma: no cover - optional path
            # An unknown or unavailable engine must not leave OCR half-enabled.
            pipeline_options.do_ocr = False
            logger.warning(
                "Unable to enable requested Docling OCR engine, using OCR-off",
                ocr_engine=ocr_engine,
                error=str(exc),
            )

    # Apply the same pipeline options to every supported input format that
    # this Docling version actually defines.
    format_options = {}
    for format_name in ("PDF", "IMAGE"):
        input_format = getattr(InputFormat, format_name, None)
        if input_format is not None:
            format_options[input_format] = PdfFormatOption(
                pipeline_options=pipeline_options
            )

    try:
        converter = DocumentConverter(format_options=format_options or None)
    except Exception as exc:  # pragma: no cover - fallback path
        logger.warning(
            "Docling converter initialization failed, falling back to defaults",
            error=str(exc),
        )
        converter = DocumentConverter()
        # The default converter never saw pipeline_options, so reporting
        # their OCR state would describe a configuration that is not in use.
        logger.info(
            "Docling converter initialized",
            using_defaults=True,
            requested_ocr_engine=ocr_engine or None,
        )
        return converter

    logger.info(
        "Docling converter initialized",
        ocr_engine=ocr_engine if pipeline_options.do_ocr else None,
        ocr_enabled=pipeline_options.do_ocr,
    )
    return converter
def get_worker_converter():
"""Get or create a DocumentConverter instance for this worker process"""
global _worker_converter
if _worker_converter is None:
from docling.document_converter import DocumentConverter
# Configure GPU settings for this worker
has_gpu_devices, _ = detect_gpu_devices()
if not has_gpu_devices:
@ -45,7 +113,7 @@ def get_worker_converter():
logger.info(
"Initializing DocumentConverter in worker process", worker_pid=os.getpid()
)
_worker_converter = DocumentConverter()
_worker_converter = create_document_converter()
logger.info("DocumentConverter ready in worker process", worker_pid=os.getpid())
return _worker_converter

2
uv.lock generated
View file

@ -2282,7 +2282,7 @@ wheels = [
[[package]]
name = "openrag"
version = "0.1.8"
version = "0.1.9"
source = { editable = "." }
dependencies = [
{ name = "agentd" },

View file

@ -1,6 +1,13 @@
import logging
import os
import sys
from docling.document_converter import DocumentConverter
repo_root = os.path.dirname(__file__)
src_path = os.path.join(repo_root, "src")
if src_path not in sys.path:
sys.path.insert(0, src_path)
from utils.document_processing import create_document_converter
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@ -11,7 +18,8 @@ try:
# Use the sample document to warm up docling
test_file = "/app/warmup_ocr.pdf"
logger.info(f"Using test file to warm up docling: {test_file}")
DocumentConverter().convert(test_file)
converter = create_document_converter()
converter.convert(test_file)
logger.info("Docling models warmed up successfully")
except Exception as e:
logger.info(f"Docling warm-up completed with exception: {str(e)}")