doc processing knobs
This commit is contained in:
parent
bdb477088c
commit
dc91f55014
5 changed files with 86 additions and 9 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
[project]
|
[project]
|
||||||
name = "openrag"
|
name = "openrag"
|
||||||
version = "0.1.8"
|
version = "0.1.9"
|
||||||
description = "Add your description here"
|
description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
|
|
|
||||||
|
|
@ -4,12 +4,12 @@ import time
|
||||||
import httpx
|
import httpx
|
||||||
import requests
|
import requests
|
||||||
from agentd.patch import patch_openai_with_mcp
|
from agentd.patch import patch_openai_with_mcp
|
||||||
from docling.document_converter import DocumentConverter
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
from opensearchpy import AsyncOpenSearch
|
from opensearchpy import AsyncOpenSearch
|
||||||
from opensearchpy._async.http_aiohttp import AIOHttpConnection
|
from opensearchpy._async.http_aiohttp import AIOHttpConnection
|
||||||
|
|
||||||
|
from utils.document_processing import create_document_converter
|
||||||
from utils.logging_config import get_logger
|
from utils.logging_config import get_logger
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
@ -45,6 +45,7 @@ LANGFLOW_KEY = os.getenv("LANGFLOW_KEY")
|
||||||
SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-production")
|
SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-production")
|
||||||
GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID")
|
GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID")
|
||||||
GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET")
|
GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET")
|
||||||
|
DOCLING_OCR_ENGINE = os.getenv("DOCLING_OCR_ENGINE")
|
||||||
|
|
||||||
# Ingestion configuration
|
# Ingestion configuration
|
||||||
DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes")
|
DISABLE_INGEST_WITH_LANGFLOW = os.getenv("DISABLE_INGEST_WITH_LANGFLOW", "false").lower() in ("true", "1", "yes")
|
||||||
|
|
@ -287,7 +288,7 @@ class AppClients:
|
||||||
self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI())
|
self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI())
|
||||||
|
|
||||||
# Initialize document converter
|
# Initialize document converter
|
||||||
self.converter = DocumentConverter()
|
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
|
||||||
|
|
||||||
# Initialize Langflow HTTP client
|
# Initialize Langflow HTTP client
|
||||||
self.langflow_http_client = httpx.AsyncClient(
|
self.langflow_http_client = httpx.AsyncClient(
|
||||||
|
|
|
||||||
|
|
@ -12,11 +12,79 @@ logger = get_logger(__name__)
|
||||||
_worker_converter = None
|
_worker_converter = None
|
||||||
|
|
||||||
|
|
||||||
|
def create_document_converter(ocr_engine: str | None = None):
    """Create a Docling DocumentConverter with OCR disabled unless requested.

    Args:
        ocr_engine: Name of the Docling OCR engine to enable. When ``None``,
            the ``DOCLING_OCR_ENGINE`` environment variable is read instead.
            An empty or blank value leaves OCR disabled.

    Returns:
        A ``DocumentConverter`` configured with OCR turned off (or with the
        requested OCR engine) for the PDF and IMAGE input formats. If the
        option classes cannot be imported, or the configured converter cannot
        be constructed, a ``DocumentConverter()`` built with the library's
        own defaults is returned instead.
    """
    if ocr_engine is None:
        ocr_engine = os.getenv("DOCLING_OCR_ENGINE")
    if ocr_engine is not None:
        # Environment values frequently carry stray whitespace; a padded name
        # would otherwise fail the factory lookup below.
        ocr_engine = ocr_engine.strip()

    try:
        from docling.document_converter import (
            DocumentConverter,
            InputFormat,
            PdfFormatOption,
        )
        from docling.datamodel.pipeline_options import PdfPipelineOptions
    except Exception as exc:  # pragma: no cover - fallback path
        # Deliberately broad: this is a best-effort path, so any import-time
        # failure of the option classes degrades to the default converter.
        logger.debug(
            "Falling back to default DocumentConverter import",
            error=str(exc),
        )
        from docling.document_converter import DocumentConverter  # type: ignore

        return DocumentConverter()

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False

    if ocr_engine:
        try:
            from docling.models.factories import get_ocr_factory

            factory = get_ocr_factory(allow_external_plugins=False)
            pipeline_options.ocr_options = factory.create_options(kind=ocr_engine)
            # Only switch OCR on once the engine options were created.
            pipeline_options.do_ocr = True
        except Exception as exc:  # pragma: no cover - optional path
            pipeline_options.do_ocr = False
            logger.warning(
                "Unable to enable requested Docling OCR engine, using OCR-off",
                ocr_engine=ocr_engine,
                error=str(exc),
            )

    # Apply the same pipeline options to every supported input format that
    # this Docling version exposes.
    format_options = {}
    for format_name in ("PDF", "IMAGE"):
        if hasattr(InputFormat, format_name):
            format_options[getattr(InputFormat, format_name)] = PdfFormatOption(
                pipeline_options=pipeline_options
            )

    try:
        converter = DocumentConverter(
            format_options=format_options if format_options else None
        )
    except Exception as exc:  # pragma: no cover - fallback path
        logger.warning(
            "Docling converter initialization failed, falling back to defaults",
            error=str(exc),
        )
        # Return immediately: the pipeline options above were NOT applied to
        # this converter, so logging their OCR state below would be misleading.
        return DocumentConverter()

    logger.info(
        "Docling converter initialized",
        ocr_engine=ocr_engine if pipeline_options.do_ocr else None,
        ocr_enabled=pipeline_options.do_ocr,
    )

    return converter
|
||||||
|
|
||||||
|
|
||||||
def get_worker_converter():
|
def get_worker_converter():
|
||||||
"""Get or create a DocumentConverter instance for this worker process"""
|
"""Get or create a DocumentConverter instance for this worker process"""
|
||||||
global _worker_converter
|
global _worker_converter
|
||||||
if _worker_converter is None:
|
if _worker_converter is None:
|
||||||
from docling.document_converter import DocumentConverter
|
|
||||||
|
|
||||||
# Configure GPU settings for this worker
|
# Configure GPU settings for this worker
|
||||||
has_gpu_devices, _ = detect_gpu_devices()
|
has_gpu_devices, _ = detect_gpu_devices()
|
||||||
|
|
@ -45,7 +113,7 @@ def get_worker_converter():
|
||||||
logger.info(
|
logger.info(
|
||||||
"Initializing DocumentConverter in worker process", worker_pid=os.getpid()
|
"Initializing DocumentConverter in worker process", worker_pid=os.getpid()
|
||||||
)
|
)
|
||||||
_worker_converter = DocumentConverter()
|
_worker_converter = create_document_converter()
|
||||||
logger.info("DocumentConverter ready in worker process", worker_pid=os.getpid())
|
logger.info("DocumentConverter ready in worker process", worker_pid=os.getpid())
|
||||||
|
|
||||||
return _worker_converter
|
return _worker_converter
|
||||||
|
|
|
||||||
2
uv.lock
generated
2
uv.lock
generated
|
|
@ -2282,7 +2282,7 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "openrag"
|
name = "openrag"
|
||||||
version = "0.1.8"
|
version = "0.1.9"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "agentd" },
|
{ name = "agentd" },
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,13 @@
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
from docling.document_converter import DocumentConverter
|
repo_root = os.path.dirname(__file__)
|
||||||
|
src_path = os.path.join(repo_root, "src")
|
||||||
|
if src_path not in sys.path:
|
||||||
|
sys.path.insert(0, src_path)
|
||||||
|
|
||||||
|
from utils.document_processing import create_document_converter
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -11,7 +18,8 @@ try:
|
||||||
# Use the sample document to warm up docling
|
# Use the sample document to warm up docling
|
||||||
test_file = "/app/warmup_ocr.pdf"
|
test_file = "/app/warmup_ocr.pdf"
|
||||||
logger.info(f"Using test file to warm up docling: {test_file}")
|
logger.info(f"Using test file to warm up docling: {test_file}")
|
||||||
DocumentConverter().convert(test_file)
|
converter = create_document_converter()
|
||||||
|
converter.convert(test_file)
|
||||||
logger.info("Docling models warmed up successfully")
|
logger.info("Docling models warmed up successfully")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.info(f"Docling warm-up completed with exception: {str(e)}")
|
logger.info(f"Docling warm-up completed with exception: {str(e)}")
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue