tasks work post refactor

estevez.sebastian@gmail.com 2025-07-30 15:49:58 -04:00
parent 13e4b971f1
commit 4d8748ec75
8 changed files with 188 additions and 119 deletions

View file

@@ -96,9 +96,6 @@ class AppClients:
         # Initialize patched OpenAI client
         self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI())
-        # Initialize Docling converter
-        self.converter = DocumentConverter()
         return self
 # Global clients instance

View file

@@ -11,12 +11,13 @@ from .connection_manager import ConnectionManager
 class ConnectorService:
     """Service to manage document connectors and process files"""
-    def __init__(self, opensearch_client, patched_async_client, process_pool, embed_model: str, index_name: str):
+    def __init__(self, opensearch_client, patched_async_client, process_pool, embed_model: str, index_name: str, task_service=None):
         self.opensearch = opensearch_client
         self.openai_client = patched_async_client
         self.process_pool = process_pool
         self.embed_model = embed_model
         self.index_name = index_name
+        self.task_service = task_service
         self.connection_manager = ConnectionManager()
     async def initialize(self):
@@ -113,6 +114,9 @@ class ConnectorService:
     async def sync_connector_files(self, connection_id: str, user_id: str, max_files: int = None) -> str:
         """Sync files from a connector connection using existing task tracking system"""
+        if not self.task_service:
+            raise ValueError("TaskService not available - connector sync requires task service dependency")
         print(f"[DEBUG] Starting sync for connection {connection_id}, max_files={max_files}")
         connector = await self.get_connector(connection_id)
@@ -155,104 +159,14 @@ class ConnectorService:
         if not files_to_process:
             raise ValueError("No files found to sync")
-        # Create upload task using existing task system
-        import uuid
-        from app import UploadTask, FileTask, TaskStatus, task_store, background_upload_processor
-        task_id = str(uuid.uuid4())
-        upload_task = UploadTask(
-            task_id=task_id,
-            total_files=len(files_to_process),
-            file_tasks={f"connector_file_{file_info['id']}": FileTask(file_path=f"connector_file_{file_info['id']}") for file_info in files_to_process}
-        )
-        # Store task for user
-        if user_id not in task_store:
-            task_store[user_id] = {}
-        task_store[user_id][task_id] = upload_task
-        # Start background processing with connector-specific logic
-        import asyncio
-        from app import background_tasks
-        background_task = asyncio.create_task(self._background_connector_sync(user_id, task_id, connection_id, files_to_process))
-        background_tasks.add(background_task)
-        background_task.add_done_callback(background_tasks.discard)
-        return task_id
-    async def _background_connector_sync(self, user_id: str, task_id: str, connection_id: str, files_to_process: List[Dict]):
-        """Background task to sync connector files"""
-        from app import task_store, TaskStatus
-        import datetime
-        try:
-            upload_task = task_store[user_id][task_id]
-            upload_task.status = TaskStatus.RUNNING
-            upload_task.updated_at = datetime.datetime.now().timestamp()
-            connector = await self.get_connector(connection_id)
-            if not connector:
-                raise ValueError(f"Connection '{connection_id}' not found")
-            # Process files with limited concurrency
-            semaphore = asyncio.Semaphore(4)  # Limit concurrent file processing
-            async def process_connector_file(file_info):
-                async with semaphore:
-                    file_key = f"connector_file_{file_info['id']}"
-                    file_task = upload_task.file_tasks[file_key]
-                    file_task.status = TaskStatus.RUNNING
-                    file_task.updated_at = datetime.datetime.now().timestamp()
-                    try:
-                        # Get file content from connector
-                        document = await connector.get_file_content(file_info['id'])
-                        # Process using existing pipeline
-                        result = await self.process_connector_document(document, user_id)
-                        file_task.status = TaskStatus.COMPLETED
-                        file_task.result = result
-                        upload_task.successful_files += 1
-                    except Exception as e:
-                        import sys
-                        import traceback
-                        error_msg = f"[ERROR] Failed to process connector file {file_info['id']}: {e}"
-                        print(error_msg, file=sys.stderr, flush=True)
-                        traceback.print_exc(file=sys.stderr)
-                        sys.stderr.flush()
-                        # Also store full traceback in task error
-                        full_error = f"{str(e)}\n{traceback.format_exc()}"
-                        file_task.status = TaskStatus.FAILED
-                        file_task.error = full_error
-                        upload_task.failed_files += 1
-                    finally:
-                        file_task.updated_at = datetime.datetime.now().timestamp()
-                        upload_task.processed_files += 1
-                        upload_task.updated_at = datetime.datetime.now().timestamp()
-            # Process all files concurrently
-            tasks = [process_connector_file(file_info) for file_info in files_to_process]
-            await asyncio.gather(*tasks, return_exceptions=True)
-            # Update connection last sync time
-            await self.connection_manager.update_last_sync(connection_id)
-            upload_task.status = TaskStatus.COMPLETED
-            upload_task.updated_at = datetime.datetime.now().timestamp()
-        except Exception as e:
-            import sys
-            import traceback
-            error_msg = f"[ERROR] Background connector sync failed for task {task_id}: {e}"
-            print(error_msg, file=sys.stderr, flush=True)
-            traceback.print_exc(file=sys.stderr)
-            sys.stderr.flush()
-            if user_id in task_store and task_id in task_store[user_id]:
-                task_store[user_id][task_id].status = TaskStatus.FAILED
-                task_store[user_id][task_id].updated_at = datetime.datetime.now().timestamp()
+        # Create custom processor for connector files
+        from models.processors import ConnectorFileProcessor
+        processor = ConnectorFileProcessor(self, connection_id, files_to_process)
+        # Use file IDs as items (no more fake file paths!)
+        file_ids = [file_info['id'] for file_info in files_to_process]
+        # Create custom task using TaskService
+        task_id = await self.task_service.create_custom_task(user_id, file_ids, processor)
+        return task_id
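
For orientation, a minimal sketch of how a caller might drive the refactored sync path; the Starlette route and response shape below are illustrative assumptions, not part of this commit, but sync_connector_files and the task_id it returns are:

    # Hypothetical Starlette handler (illustrative only).
    from starlette.responses import JSONResponse

    async def sync_connection(request):
        user_id = request.path_params["user_id"]              # assumed routing
        connection_id = request.path_params["connection_id"]
        # ConnectorService now delegates to TaskService.create_custom_task
        task_id = await connector_service.sync_connector_files(connection_id, user_id)
        return JSONResponse({"task_id": task_id})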

View file

@@ -1,10 +1,18 @@
 import asyncio
 import atexit
-import torch
+import multiprocessing
 from functools import partial
 from starlette.applications import Starlette
 from starlette.routing import Route
+# Set multiprocessing start method to 'spawn' for CUDA compatibility
+multiprocessing.set_start_method('spawn', force=True)
+# Create process pool FIRST, before any torch/CUDA imports
+from utils.process_pool import process_pool
+import torch
 # Configuration and setup
 from config.settings import clients, INDEX_NAME, INDEX_BODY, SESSION_SECRET
 from utils.gpu_detection import detect_gpu_devices
@@ -65,19 +73,20 @@ def initialize_services():
     # Initialize services
     document_service = DocumentService()
     search_service = SearchService()
-    task_service = TaskService(document_service)
+    task_service = TaskService(document_service, process_pool)
     chat_service = ChatService()
     # Set process pool for document service
-    document_service.process_pool = task_service.process_pool
+    document_service.process_pool = process_pool
     # Initialize connector service
     connector_service = ConnectorService(
         opensearch_client=clients.opensearch,
         patched_async_client=clients.patched_async_client,
-        process_pool=task_service.process_pool,
+        process_pool=process_pool,
         embed_model="text-embedding-3-small",
-        index_name=INDEX_NAME
+        index_name=INDEX_NAME,
+        task_service=task_service
     )
     # Initialize auth service

src/models/processors.py Normal file
View file

@@ -0,0 +1,78 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+from .tasks import UploadTask, FileTask
+
+
+class TaskProcessor(ABC):
+    """Abstract base class for task processors"""
+
+    @abstractmethod
+    async def process_item(self, upload_task: UploadTask, item: Any, file_task: FileTask) -> None:
+        """
+        Process a single item in the task.
+
+        Args:
+            upload_task: The overall upload task
+            item: The item to process (could be file path, file info, etc.)
+            file_task: The specific file task to update
+        """
+        pass
+
+
+class DocumentFileProcessor(TaskProcessor):
+    """Default processor for regular file uploads"""
+
+    def __init__(self, document_service):
+        self.document_service = document_service
+
+    async def process_item(self, upload_task: UploadTask, item: str, file_task: FileTask) -> None:
+        """Process a regular file path using DocumentService"""
+        # This calls the existing logic
+        await self.document_service.process_single_file_task(upload_task, item)
+
+
+class ConnectorFileProcessor(TaskProcessor):
+    """Processor for connector file uploads"""
+
+    def __init__(self, connector_service, connection_id: str, files_to_process: list):
+        self.connector_service = connector_service
+        self.connection_id = connection_id
+        self.files_to_process = files_to_process
+        # Create lookup map for file info
+        self.file_info_map = {f['id']: f for f in files_to_process}
+
+    async def process_item(self, upload_task: UploadTask, item: str, file_task: FileTask) -> None:
+        """Process a connector file using ConnectorService"""
+        from models.tasks import TaskStatus
+        import time
+
+        file_id = item  # item is the connector file ID
+        file_info = self.file_info_map.get(file_id)
+        if not file_info:
+            raise ValueError(f"File info not found for {file_id}")
+
+        # Get the connector
+        connector = await self.connector_service.get_connector(self.connection_id)
+        if not connector:
+            raise ValueError(f"Connection '{self.connection_id}' not found")
+
+        # Get file content from connector
+        document = await connector.get_file_content(file_info['id'])
+
+        # Get user_id from task store lookup
+        user_id = None
+        for uid, tasks in self.connector_service.task_service.task_store.items():
+            if upload_task.task_id in tasks:
+                user_id = uid
+                break
+        if not user_id:
+            raise ValueError("Could not determine user_id for task")
+
+        # Process using existing pipeline
+        result = await self.connector_service.process_connector_document(document, user_id)
+
+        file_task.status = TaskStatus.COMPLETED
+        file_task.result = result
+        upload_task.successful_files += 1
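
Because TaskProcessor is an ABC keyed only on process_item, other item types can plug into the same task machinery. A minimal sketch of a hypothetical third processor (not in this commit; index_text is an assumed helper):

    from models.processors import TaskProcessor
    from models.tasks import TaskStatus, UploadTask, FileTask

    class TextSnippetProcessor(TaskProcessor):
        """Hypothetical processor that indexes raw text snippets."""
        def __init__(self, document_service):
            self.document_service = document_service

        async def process_item(self, upload_task: UploadTask, item: str, file_task: FileTask) -> None:
            # 'item' is the snippet itself; index it via an assumed helper.
            result = await self.document_service.index_text(item)  # hypothetical method
            file_task.status = TaskStatus.COMPLETED
            file_task.result = result
            upload_task.successful_files += 1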

View file

@@ -132,7 +132,7 @@ class DocumentService:
         file_task.updated_at = time.time()
         try:
-            # Check if file already exists in index
+            # Handle regular file processing
             loop = asyncio.get_event_loop()
             # Run CPU-intensive docling processing in separate process

View file

@@ -3,21 +3,21 @@ import uuid
 import time
 import random
 from typing import Dict
-from concurrent.futures import ProcessPoolExecutor
 from models.tasks import TaskStatus, UploadTask, FileTask
-from utils.gpu_detection import get_worker_count
+from src.utils.gpu_detection import get_worker_count
 class TaskService:
-    def __init__(self, document_service=None):
+    def __init__(self, document_service=None, process_pool=None):
         self.document_service = document_service
+        self.process_pool = process_pool
         self.task_store: Dict[str, Dict[str, UploadTask]] = {}  # user_id -> {task_id -> UploadTask}
         self.background_tasks = set()
-        # Initialize process pool
-        max_workers = get_worker_count()
-        self.process_pool = ProcessPoolExecutor(max_workers=max_workers)
-        print(f"Process pool initialized with {max_workers} workers")
+        if self.process_pool is None:
+            raise ValueError("TaskService requires a process_pool parameter")
     async def exponential_backoff_delay(self, retry_count: int, base_delay: float = 1.0, max_delay: float = 60.0) -> None:
         """Apply exponential backoff with jitter"""
@@ -26,19 +26,29 @@ class TaskService:
     async def create_upload_task(self, user_id: str, file_paths: list) -> str:
         """Create a new upload task for bulk file processing"""
+        # Use default DocumentFileProcessor
+        from models.processors import DocumentFileProcessor
+        processor = DocumentFileProcessor(self.document_service)
+        return await self.create_custom_task(user_id, file_paths, processor)
+    async def create_custom_task(self, user_id: str, items: list, processor) -> str:
+        """Create a new task with custom processor for any type of items"""
         task_id = str(uuid.uuid4())
         upload_task = UploadTask(
             task_id=task_id,
-            total_files=len(file_paths),
-            file_tasks={path: FileTask(file_path=path) for path in file_paths}
+            total_files=len(items),
+            file_tasks={str(item): FileTask(file_path=str(item)) for item in items}
         )
+        # Attach the custom processor to the task
+        upload_task.processor = processor
         if user_id not in self.task_store:
             self.task_store[user_id] = {}
         self.task_store[user_id][task_id] = upload_task
         # Start background processing
-        background_task = asyncio.create_task(self.background_upload_processor(user_id, task_id))
+        background_task = asyncio.create_task(self.background_custom_processor(user_id, task_id, items))
         self.background_tasks.add(background_task)
         background_task.add_done_callback(self.background_tasks.discard)
@@ -74,6 +84,58 @@ class TaskService:
             self.task_store[user_id][task_id].status = TaskStatus.FAILED
             self.task_store[user_id][task_id].updated_at = time.time()
+    async def background_custom_processor(self, user_id: str, task_id: str, items: list) -> None:
+        """Background task to process items using custom processor"""
+        try:
+            upload_task = self.task_store[user_id][task_id]
+            upload_task.status = TaskStatus.RUNNING
+            upload_task.updated_at = time.time()
+            processor = upload_task.processor
+            # Process items with limited concurrency
+            max_workers = get_worker_count()
+            semaphore = asyncio.Semaphore(max_workers * 2)
+            async def process_with_semaphore(item, item_key: str):
+                async with semaphore:
+                    file_task = upload_task.file_tasks[item_key]
+                    file_task.status = TaskStatus.RUNNING
+                    file_task.updated_at = time.time()
+                    try:
+                        await processor.process_item(upload_task, item, file_task)
+                    except Exception as e:
+                        print(f"[ERROR] Failed to process item {item}: {e}")
+                        import traceback
+                        traceback.print_exc()
+                        file_task.status = TaskStatus.FAILED
+                        file_task.error = str(e)
+                        upload_task.failed_files += 1
+                    finally:
+                        file_task.updated_at = time.time()
+                        upload_task.processed_files += 1
+                        upload_task.updated_at = time.time()
+            tasks = [
+                process_with_semaphore(item, str(item))
+                for item in items
+            ]
+            await asyncio.gather(*tasks, return_exceptions=True)
+            # Mark task as completed
+            upload_task.status = TaskStatus.COMPLETED
+            upload_task.updated_at = time.time()
+        except Exception as e:
+            print(f"[ERROR] Background custom processor failed for task {task_id}: {e}")
+            import traceback
+            traceback.print_exc()
+            if user_id in self.task_store and task_id in self.task_store[user_id]:
+                self.task_store[user_id][task_id].status = TaskStatus.FAILED
+                self.task_store[user_id][task_id].updated_at = time.time()
     def get_task_status(self, user_id: str, task_id: str) -> dict:
         """Get the status of a specific upload task"""
         if (not task_id or
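
From the caller's side the lifecycle is create-then-poll. A hedged sketch; the diff doesn't show the dict that get_task_status returns, so the field names and values below are assumptions:

    import asyncio

    async def wait_for_task(task_service, user_id: str, task_id: str) -> dict:
        # Poll until the background processor marks the task terminal.
        while True:
            status = task_service.get_task_status(user_id, task_id)
            if status.get("status") in ("completed", "failed"):  # assumed fields
                return status
            await asyncio.sleep(1.0)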

View file

@@ -1,7 +1,6 @@
 import hashlib
 import os
 from collections import defaultdict
-from docling.document_converter import DocumentConverter
 from .gpu_detection import detect_gpu_devices
 # Global converter cache for worker processes
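
The app-level DocumentConverter goes away because conversion now happens inside pool workers; the "global converter cache" comment above refers to a lazy per-process singleton. A minimal sketch of that pattern, with assumed names:

    _converter = None

    def get_converter():
        """Build the Docling converter once per worker process, on first use."""
        global _converter
        if _converter is None:
            from docling.document_converter import DocumentConverter
            _converter = DocumentConverter()
        return _converter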

src/utils/process_pool.py Normal file
View file

@@ -0,0 +1,10 @@
+import os
+from concurrent.futures import ProcessPoolExecutor
+
+from utils.gpu_detection import get_worker_count
+
+# Create shared process pool at import time (before CUDA initialization)
+# This avoids the "Cannot re-initialize CUDA in forked subprocess" error
+MAX_WORKERS = get_worker_count()
+process_pool = ProcessPoolExecutor(max_workers=MAX_WORKERS)
+print(f"Shared process pool initialized with {MAX_WORKERS} workers")