openrag/src/tui/managers/docling_manager.py
2025-09-16 10:17:24 -04:00

403 lines
16 KiB
Python

"""Docling serve manager for local document processing service."""
import asyncio
import os
import subprocess
import sys
import threading
import time
from typing import Optional, Tuple, Dict, Any, List, AsyncIterator
from utils.logging_config import get_logger
logger = get_logger(__name__)
class DoclingManager:
"""Manages local docling serve instance as external process."""
_instance = None
_initialized = False
def __new__(cls):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def __init__(self):
# Only initialize once
if self._initialized:
return
self._process: Optional[subprocess.Popen] = None
self._port = 5001
self._host = "127.0.0.1"
self._running = False
self._external_process = False
# Log storage - simplified, no queue
self._log_buffer: List[str] = []
self._max_log_lines = 1000
self._log_lock = threading.Lock() # Thread-safe access to log buffer
self._initialized = True
def cleanup(self):
"""Cleanup resources and stop any running processes."""
if self._process and self._process.poll() is None:
self._add_log_entry("Cleaning up docling-serve process on exit")
try:
self._process.terminate()
self._process.wait(timeout=5)
except subprocess.TimeoutExpired:
self._process.kill()
self._process.wait()
except Exception as e:
self._add_log_entry(f"Error during cleanup: {e}")
self._running = False
self._process = None
def _add_log_entry(self, message: str) -> None:
"""Add a log entry to the buffer (thread-safe)."""
timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
entry = f"[{timestamp}] {message}"
with self._log_lock:
self._log_buffer.append(entry)
# Keep buffer size limited
if len(self._log_buffer) > self._max_log_lines:
self._log_buffer = self._log_buffer[-self._max_log_lines:]
def is_running(self) -> bool:
"""Check if docling serve is running."""
# First check our internal state
internal_running = self._running and self._process is not None and self._process.poll() is None
# If we think it's not running, check if something is listening on the port
# This handles cases where docling-serve was started outside the TUI
if not internal_running:
try:
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(0.5)
result = s.connect_ex((self._host, self._port))
s.close()
# If port is in use, something is running there
if result == 0:
# Only log this once when we first detect external process
if not self._external_process:
self._add_log_entry(f"Detected external docling-serve running on {self._host}:{self._port}")
# Set a flag to indicate this is an external process
self._external_process = True
return True
except Exception as e:
# Only log errors occasionally to avoid spam
if not hasattr(self, '_last_port_error') or self._last_port_error != str(e):
self._add_log_entry(f"Error checking port: {e}")
self._last_port_error = str(e)
else:
# If we started it, it's not external
self._external_process = False
return internal_running
def get_status(self) -> Dict[str, Any]:
"""Get current status of docling serve."""
if self.is_running():
pid = self._process.pid if self._process else None
return {
"status": "running",
"port": self._port,
"host": self._host,
"endpoint": f"http://{self._host}:{self._port}",
"docs_url": f"http://{self._host}:{self._port}/docs",
"ui_url": f"http://{self._host}:{self._port}/ui",
"pid": pid
}
else:
return {
"status": "stopped",
"port": self._port,
"host": self._host,
"endpoint": None,
"docs_url": None,
"ui_url": None,
"pid": None
}
async def start(self, port: int = 5001, host: str = "127.0.0.1", enable_ui: bool = False) -> Tuple[bool, str]:
"""Start docling serve as external process."""
if self.is_running():
return False, "Docling serve is already running"
self._port = port
self._host = host
# Clear log buffer when starting
self._log_buffer = []
self._add_log_entry("Starting docling serve as external process...")
try:
# Build command to run docling-serve
# Check if we should use uv run (look for uv in environment or check if we're in a uv project)
import shutil
if shutil.which("uv") and (os.path.exists("pyproject.toml") or os.getenv("VIRTUAL_ENV")):
cmd = [
"uv", "run", "python", "-m", "docling_serve", "run",
"--host", host,
"--port", str(port),
]
else:
cmd = [
sys.executable, "-m", "docling_serve", "run",
"--host", host,
"--port", str(port),
]
if enable_ui:
cmd.append("--enable-ui")
self._add_log_entry(f"Starting process: {' '.join(cmd)}")
# Start as subprocess
self._process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
bufsize=0 # Unbuffered for real-time output
)
self._running = True
self._add_log_entry("External process started")
# Start a thread to capture output
self._start_output_capture()
# Wait for the process to start and begin listening
self._add_log_entry("Waiting for docling-serve to start listening...")
# Wait up to 10 seconds for the service to start listening
for i in range(10):
await asyncio.sleep(1.0)
# Check if process is still alive
if self._process.poll() is not None:
break
# Check if it's listening on the port
try:
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(0.5)
result = s.connect_ex((host, port))
s.close()
if result == 0:
self._add_log_entry(f"Docling-serve is now listening on {host}:{port}")
break
except:
pass
self._add_log_entry(f"Waiting for startup... ({i+1}/10)")
# Add a test message to verify logging is working
self._add_log_entry(f"Process PID: {self._process.pid}, Poll: {self._process.poll()}")
if self._process.poll() is not None:
# Process already exited - get return code and any output
return_code = self._process.returncode
self._add_log_entry(f"Process exited with code: {return_code}")
try:
# Try to read any remaining output
stdout_data = ""
stderr_data = ""
if self._process.stdout:
stdout_data = self._process.stdout.read()
if self._process.stderr:
stderr_data = self._process.stderr.read()
if stdout_data:
self._add_log_entry(f"Final stdout: {stdout_data[:500]}")
if stderr_data:
self._add_log_entry(f"Final stderr: {stderr_data[:500]}")
except Exception as e:
self._add_log_entry(f"Error reading final output: {e}")
self._running = False
return False, f"Docling serve process exited immediately (code: {return_code})"
return True, f"Docling serve starting on http://{host}:{port}"
except FileNotFoundError:
return False, "docling-serve not available. Please install: uv add docling-serve"
except Exception as e:
self._running = False
self._process = None
return False, f"Error starting docling serve: {str(e)}"
def _start_output_capture(self):
"""Start threads to capture subprocess stdout and stderr."""
def capture_stdout():
if not self._process or not self._process.stdout:
self._add_log_entry("No stdout pipe available")
return
self._add_log_entry("Starting stdout capture thread")
try:
while self._running and self._process and self._process.poll() is None:
line = self._process.stdout.readline()
if line:
self._add_log_entry(f"STDOUT: {line.rstrip()}")
else:
# No more output, wait a bit
time.sleep(0.1)
except Exception as e:
self._add_log_entry(f"Error capturing stdout: {e}")
finally:
self._add_log_entry("Stdout capture thread ended")
def capture_stderr():
if not self._process or not self._process.stderr:
self._add_log_entry("No stderr pipe available")
return
self._add_log_entry("Starting stderr capture thread")
try:
while self._running and self._process and self._process.poll() is None:
line = self._process.stderr.readline()
if line:
self._add_log_entry(f"STDERR: {line.rstrip()}")
else:
# No more output, wait a bit
time.sleep(0.1)
except Exception as e:
self._add_log_entry(f"Error capturing stderr: {e}")
finally:
self._add_log_entry("Stderr capture thread ended")
# Start both capture threads
stdout_thread = threading.Thread(target=capture_stdout, daemon=True)
stderr_thread = threading.Thread(target=capture_stderr, daemon=True)
stdout_thread.start()
stderr_thread.start()
self._add_log_entry("Output capture threads started")
async def stop(self) -> Tuple[bool, str]:
"""Stop docling serve."""
if not self.is_running():
return False, "Docling serve is not running"
try:
self._add_log_entry("Stopping docling-serve process")
if self._process:
# We started this process, so we can stop it directly
self._add_log_entry(f"Terminating our process (PID: {self._process.pid})")
self._process.terminate()
# Wait for it to stop
try:
self._process.wait(timeout=10)
self._add_log_entry("Process terminated gracefully")
except subprocess.TimeoutExpired:
# Force kill if it doesn't stop gracefully
self._add_log_entry("Process didn't stop gracefully, force killing")
self._process.kill()
self._process.wait()
self._add_log_entry("Process force killed")
elif self._external_process:
# This is an external process, we can't stop it directly
self._add_log_entry("Cannot stop external docling-serve process - it was started outside the TUI")
self._running = False
self._external_process = False
return False, "Cannot stop external docling-serve process. Please stop it manually."
self._running = False
self._process = None
self._external_process = False
self._add_log_entry("Docling serve stopped successfully")
return True, "Docling serve stopped successfully"
except Exception as e:
self._add_log_entry(f"Error stopping docling serve: {e}")
return False, f"Error stopping docling serve: {str(e)}"
async def restart(self, port: Optional[int] = None, host: Optional[str] = None, enable_ui: bool = False) -> Tuple[bool, str]:
"""Restart docling serve."""
# Use current settings if not specified
if port is None:
port = self._port
if host is None:
host = self._host
# Stop if running
if self.is_running():
success, msg = await self.stop()
if not success:
return False, f"Failed to stop: {msg}"
# Wait a moment for cleanup
await asyncio.sleep(1)
# Start with new settings
return await self.start(port, host, enable_ui)
def add_manual_log_entry(self, message: str) -> None:
"""Add a manual log entry - useful for debugging."""
self._add_log_entry(f"MANUAL: {message}")
def get_logs(self, lines: int = 50) -> Tuple[bool, str]:
"""Get logs from the docling-serve process."""
if self.is_running():
with self._log_lock:
# If we have no logs but the service is running, it might have been started externally
if not self._log_buffer:
return True, "No logs available yet..."
# Return the most recent logs
log_count = min(lines, len(self._log_buffer))
logs = "\n".join(self._log_buffer[-log_count:])
return True, logs
else:
return True, "Docling serve is not running."
async def follow_logs(self) -> AsyncIterator[str]:
"""Follow logs from the docling-serve process in real-time."""
# First yield status message and any existing logs
status_msg = f"Docling serve is running on http://{self._host}:{self._port}"
with self._log_lock:
if self._log_buffer:
yield "\n".join(self._log_buffer)
last_log_index = len(self._log_buffer)
else:
yield "Waiting for logs..."
last_log_index = 0
# Then start monitoring for new logs
while self.is_running():
with self._log_lock:
# Check if we have new logs
if len(self._log_buffer) > last_log_index:
# Yield only the new logs
new_logs = self._log_buffer[last_log_index:]
yield "\n".join(new_logs)
last_log_index = len(self._log_buffer)
# Wait a bit before checking again
await asyncio.sleep(0.1)
# Final check for any logs that came in during shutdown
with self._log_lock:
if len(self._log_buffer) > last_log_index:
yield "\n".join(self._log_buffer[last_log_index:])