From dc91f55014794f76def3bbb02e6f2e4d2aeab78c Mon Sep 17 00:00:00 2001 From: phact Date: Thu, 18 Sep 2025 16:27:01 -0400 Subject: [PATCH] doc processing knobs --- pyproject.toml | 2 +- src/config/settings.py | 5 ++- src/utils/document_processing.py | 74 ++++++++++++++++++++++++++++++-- uv.lock | 2 +- warm_up_docling.py | 12 +++++- 5 files changed, 86 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a2a0e41f..d02bdc0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openrag" -version = "0.1.8" +version = "0.1.9" description = "Add your description here" readme = "README.md" requires-python = ">=3.13" diff --git a/src/config/settings.py b/src/config/settings.py index 715146fb..9a580c76 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -4,12 +4,12 @@ import time import httpx import requests from agentd.patch import patch_openai_with_mcp -from docling.document_converter import DocumentConverter from dotenv import load_dotenv from openai import AsyncOpenAI from opensearchpy import AsyncOpenSearch from opensearchpy._async.http_aiohttp import AIOHttpConnection +from utils.document_processing import create_document_converter from utils.logging_config import get_logger load_dotenv() @@ -45,6 +45,7 @@ LANGFLOW_KEY = os.getenv("LANGFLOW_KEY") SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-production") GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") +DOCLING_OCR_ENGINE = os.getenv("DOCLING_OCR_ENGINE") # Ingestion configuration DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes") @@ -287,7 +288,7 @@ class AppClients: self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI()) # Initialize document converter - self.converter = DocumentConverter() + self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE) # Initialize Langflow HTTP client self.langflow_http_client = httpx.AsyncClient( diff --git a/src/utils/document_processing.py b/src/utils/document_processing.py index e46d8f16..a8792e46 100644 --- a/src/utils/document_processing.py +++ b/src/utils/document_processing.py @@ -12,12 +12,80 @@ logger = get_logger(__name__) _worker_converter = None +def create_document_converter(ocr_engine: str | None = None): + """Create a Docling DocumentConverter with OCR disabled unless requested.""" + if ocr_engine is None: + ocr_engine = os.getenv("DOCLING_OCR_ENGINE") + + try: + from docling.document_converter import ( + DocumentConverter, + InputFormat, + PdfFormatOption, + ) + from docling.datamodel.pipeline_options import PdfPipelineOptions + except Exception as exc: # pragma: no cover - fallback path + logger.debug( + "Falling back to default DocumentConverter import", + error=str(exc), + ) + from docling.document_converter import DocumentConverter # type: ignore + + return DocumentConverter() + + pipeline_options = PdfPipelineOptions() + pipeline_options.do_ocr = False + + if ocr_engine: + try: + from docling.models.factories import get_ocr_factory + + factory = get_ocr_factory(allow_external_plugins=False) + pipeline_options.do_ocr = True + pipeline_options.ocr_options = factory.create_options(kind=ocr_engine) + except Exception as exc: # pragma: no cover - optional path + pipeline_options.do_ocr = False + logger.warning( + "Unable to enable requested Docling OCR engine, using OCR-off", + ocr_engine=ocr_engine, + error=str(exc), + ) + + format_options = {} + if hasattr(InputFormat, "PDF"): + format_options[getattr(InputFormat, "PDF")] = PdfFormatOption( + pipeline_options=pipeline_options + ) + if hasattr(InputFormat, "IMAGE"): + format_options[getattr(InputFormat, "IMAGE")] = PdfFormatOption( + pipeline_options=pipeline_options + ) + + try: + converter = DocumentConverter( + format_options=format_options if format_options else None + ) + except Exception as exc: # pragma: no cover - fallback path + logger.warning( + "Docling converter initialization failed, falling back to defaults", + error=str(exc), + ) + converter = DocumentConverter() + + logger.info( + "Docling converter initialized", + ocr_engine=ocr_engine if pipeline_options.do_ocr else None, + ocr_enabled=pipeline_options.do_ocr, + ) + + return converter + + def get_worker_converter(): """Get or create a DocumentConverter instance for this worker process""" global _worker_converter if _worker_converter is None: - from docling.document_converter import DocumentConverter - + # Configure GPU settings for this worker has_gpu_devices, _ = detect_gpu_devices() if not has_gpu_devices: @@ -45,7 +113,7 @@ def get_worker_converter(): logger.info( "Initializing DocumentConverter in worker process", worker_pid=os.getpid() ) - _worker_converter = DocumentConverter() + _worker_converter = create_document_converter() logger.info("DocumentConverter ready in worker process", worker_pid=os.getpid()) return _worker_converter diff --git a/uv.lock b/uv.lock index 0a60fd52..841eb9fd 100644 --- a/uv.lock +++ b/uv.lock @@ -2282,7 +2282,7 @@ wheels = [ [[package]] name = "openrag" -version = "0.1.8" +version = "0.1.9" source = { editable = "." } dependencies = [ { name = "agentd" }, diff --git a/warm_up_docling.py b/warm_up_docling.py index c605bef5..3a834e2f 100644 --- a/warm_up_docling.py +++ b/warm_up_docling.py @@ -1,6 +1,13 @@ import logging +import os +import sys -from docling.document_converter import DocumentConverter +repo_root = os.path.dirname(__file__) +src_path = os.path.join(repo_root, "src") +if src_path not in sys.path: + sys.path.insert(0, src_path) + +from utils.document_processing import create_document_converter logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -11,7 +18,8 @@ try: # Use the sample document to warm up docling test_file = "/app/warmup_ocr.pdf" logger.info(f"Using test file to warm up docling: {test_file}") - DocumentConverter().convert(test_file) + converter = create_document_converter() + converter.convert(test_file) logger.info("Docling models warmed up successfully") except Exception as e: logger.info(f"Docling warm-up completed with exception: {str(e)}")