doc processing knobs
This commit is contained in:
parent
bdb477088c
commit
dc91f55014
5 changed files with 86 additions and 9 deletions
|
|
@ -1,6 +1,6 @@
|
|||
[project]
|
||||
name = "openrag"
|
||||
version = "0.1.8"
|
||||
version = "0.1.9"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
|
|
|
|||
|
|
@ -4,12 +4,12 @@ import time
|
|||
import httpx
|
||||
import requests
|
||||
from agentd.patch import patch_openai_with_mcp
|
||||
from docling.document_converter import DocumentConverter
|
||||
from dotenv import load_dotenv
|
||||
from openai import AsyncOpenAI
|
||||
from opensearchpy import AsyncOpenSearch
|
||||
from opensearchpy._async.http_aiohttp import AIOHttpConnection
|
||||
|
||||
from utils.document_processing import create_document_converter
|
||||
from utils.logging_config import get_logger
|
||||
|
||||
load_dotenv()
|
||||
|
|
@ -45,6 +45,7 @@ LANGFLOW_KEY = os.getenv("LANGFLOW_KEY")
|
|||
SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-production")
|
||||
GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID")
|
||||
GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET")
|
||||
DOCLING_OCR_ENGINE = os.getenv("DOCLING_OCR_ENGINE")
|
||||
|
||||
# Ingestion configuration
|
||||
DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes")
|
||||
|
|
@ -287,7 +288,7 @@ class AppClients:
|
|||
self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI())
|
||||
|
||||
# Initialize document converter
|
||||
self.converter = DocumentConverter()
|
||||
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
|
||||
|
||||
# Initialize Langflow HTTP client
|
||||
self.langflow_http_client = httpx.AsyncClient(
|
||||
|
|
|
|||
|
|
@ -12,12 +12,80 @@ logger = get_logger(__name__)
|
|||
_worker_converter = None
|
||||
|
||||
|
||||
def create_document_converter(ocr_engine: str | None = None):
    """Build the Docling ``DocumentConverter`` used for document processing.

    OCR is switched off on the pipeline options unless an engine name is
    supplied, either through ``ocr_engine`` or, when that argument is
    ``None``, through the ``DOCLING_OCR_ENGINE`` environment variable.

    Args:
        ocr_engine: Engine kind handed to Docling's OCR factory. ``None``
            defers to the environment variable; a falsy value leaves OCR
            disabled.

    Returns:
        A ``DocumentConverter``. A default-constructed converter is returned
        when the extended Docling imports fail, or when constructing the
        converter with the custom format options raises.
    """
    # Only an explicit None defers to the environment; "" is kept as-is.
    if ocr_engine is None:
        ocr_engine = os.getenv("DOCLING_OCR_ENGINE")

    try:
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import (
            DocumentConverter,
            InputFormat,
            PdfFormatOption,
        )
    except Exception as import_failure:  # pragma: no cover - fallback path
        logger.debug(
            "Falling back to default DocumentConverter import",
            error=str(import_failure),
        )
        from docling.document_converter import DocumentConverter  # type: ignore

        return DocumentConverter()

    # Start from OCR-off; it is only turned on once an engine is resolved.
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = False

    if ocr_engine:
        try:
            from docling.models.factories import get_ocr_factory

            ocr_factory = get_ocr_factory(allow_external_plugins=False)
            pdf_options.do_ocr = True
            pdf_options.ocr_options = ocr_factory.create_options(kind=ocr_engine)
        except Exception as ocr_failure:  # pragma: no cover - optional path
            # Best effort: an unusable engine must not block conversion.
            pdf_options.do_ocr = False
            logger.warning(
                "Unable to enable requested Docling OCR engine, using OCR-off",
                ocr_engine=ocr_engine,
                error=str(ocr_failure),
            )

    # Apply the same pipeline options to each input format this Docling
    # build exposes; formats missing from InputFormat are skipped.
    per_format = {
        getattr(InputFormat, member): PdfFormatOption(pipeline_options=pdf_options)
        for member in ("PDF", "IMAGE")
        if hasattr(InputFormat, member)
    }

    try:
        converter = DocumentConverter(format_options=per_format or None)
    except Exception as init_failure:  # pragma: no cover - fallback path
        logger.warning(
            "Docling converter initialization failed, falling back to defaults",
            error=str(init_failure),
        )
        converter = DocumentConverter()

    ocr_enabled = pdf_options.do_ocr
    logger.info(
        "Docling converter initialized",
        ocr_engine=ocr_engine if ocr_enabled else None,
        ocr_enabled=ocr_enabled,
    )

    return converter
|
||||
|
||||
|
||||
def get_worker_converter():
|
||||
"""Get or create a DocumentConverter instance for this worker process"""
|
||||
global _worker_converter
|
||||
if _worker_converter is None:
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
|
||||
# Configure GPU settings for this worker
|
||||
has_gpu_devices, _ = detect_gpu_devices()
|
||||
if not has_gpu_devices:
|
||||
|
|
@ -45,7 +113,7 @@ def get_worker_converter():
|
|||
logger.info(
|
||||
"Initializing DocumentConverter in worker process", worker_pid=os.getpid()
|
||||
)
|
||||
_worker_converter = DocumentConverter()
|
||||
_worker_converter = create_document_converter()
|
||||
logger.info("DocumentConverter ready in worker process", worker_pid=os.getpid())
|
||||
|
||||
return _worker_converter
|
||||
|
|
|
|||
2
uv.lock
generated
2
uv.lock
generated
|
|
@ -2282,7 +2282,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "openrag"
|
||||
version = "0.1.8"
|
||||
version = "0.1.9"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "agentd" },
|
||||
|
|
|
|||
|
|
@ -1,6 +1,13 @@
|
|||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
repo_root = os.path.dirname(__file__)
|
||||
src_path = os.path.join(repo_root, "src")
|
||||
if src_path not in sys.path:
|
||||
sys.path.insert(0, src_path)
|
||||
|
||||
from utils.document_processing import create_document_converter
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -11,7 +18,8 @@ try:
|
|||
# Use the sample document to warm up docling
|
||||
test_file = "/app/warmup_ocr.pdf"
|
||||
logger.info(f"Using test file to warm up docling: {test_file}")
|
||||
DocumentConverter().convert(test_file)
|
||||
converter = create_document_converter()
|
||||
converter.convert(test_file)
|
||||
logger.info("Docling models warmed up successfully")
|
||||
except Exception as e:
|
||||
logger.info(f"Docling warm-up completed with exception: {str(e)}")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue