tasks work post refactor

estevez.sebastian@gmail.com 2025-07-30 15:49:58 -04:00
parent 13e4b971f1
commit 4d8748ec75
8 changed files with 188 additions and 119 deletions

View file

@@ -96,9 +96,6 @@ class AppClients:
         # Initialize patched OpenAI client
         self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI())
-        # Initialize Docling converter
-        self.converter = DocumentConverter()
         return self
 # Global clients instance

View file

@@ -11,12 +11,13 @@ from .connection_manager import ConnectionManager
 class ConnectorService:
     """Service to manage document connectors and process files"""
-    def __init__(self, opensearch_client, patched_async_client, process_pool, embed_model: str, index_name: str):
+    def __init__(self, opensearch_client, patched_async_client, process_pool, embed_model: str, index_name: str, task_service=None):
         self.opensearch = opensearch_client
         self.openai_client = patched_async_client
         self.process_pool = process_pool
         self.embed_model = embed_model
         self.index_name = index_name
+        self.task_service = task_service
         self.connection_manager = ConnectionManager()
     async def initialize(self):
@@ -113,6 +114,9 @@ class ConnectorService:
     async def sync_connector_files(self, connection_id: str, user_id: str, max_files: int = None) -> str:
         """Sync files from a connector connection using existing task tracking system"""
+        if not self.task_service:
+            raise ValueError("TaskService not available - connector sync requires task service dependency")
         print(f"[DEBUG] Starting sync for connection {connection_id}, max_files={max_files}")
         connector = await self.get_connector(connection_id)
@@ -155,104 +159,14 @@ class ConnectorService:
         if not files_to_process:
             raise ValueError("No files found to sync")
-        # Create upload task using existing task system
-        import uuid
-        from app import UploadTask, FileTask, TaskStatus, task_store, background_upload_processor
-        task_id = str(uuid.uuid4())
-        upload_task = UploadTask(
-            task_id=task_id,
-            total_files=len(files_to_process),
-            file_tasks={f"connector_file_{file_info['id']}": FileTask(file_path=f"connector_file_{file_info['id']}") for file_info in files_to_process}
-        )
-        # Store task for user
-        if user_id not in task_store:
-            task_store[user_id] = {}
-        task_store[user_id][task_id] = upload_task
-        # Start background processing with connector-specific logic
-        import asyncio
-        from app import background_tasks
-        background_task = asyncio.create_task(self._background_connector_sync(user_id, task_id, connection_id, files_to_process))
-        background_tasks.add(background_task)
-        background_task.add_done_callback(background_tasks.discard)
-        return task_id
-    async def _background_connector_sync(self, user_id: str, task_id: str, connection_id: str, files_to_process: List[Dict]):
-        """Background task to sync connector files"""
-        from app import task_store, TaskStatus
-        import datetime
-        try:
-            upload_task = task_store[user_id][task_id]
-            upload_task.status = TaskStatus.RUNNING
-            upload_task.updated_at = datetime.datetime.now().timestamp()
-            connector = await self.get_connector(connection_id)
-            if not connector:
-                raise ValueError(f"Connection '{connection_id}' not found")
-            # Process files with limited concurrency
-            semaphore = asyncio.Semaphore(4)  # Limit concurrent file processing
-            async def process_connector_file(file_info):
-                async with semaphore:
-                    file_key = f"connector_file_{file_info['id']}"
-                    file_task = upload_task.file_tasks[file_key]
-                    file_task.status = TaskStatus.RUNNING
-                    file_task.updated_at = datetime.datetime.now().timestamp()
-                    try:
-                        # Get file content from connector
-                        document = await connector.get_file_content(file_info['id'])
-                        # Process using existing pipeline
-                        result = await self.process_connector_document(document, user_id)
-                        file_task.status = TaskStatus.COMPLETED
-                        file_task.result = result
-                        upload_task.successful_files += 1
-                    except Exception as e:
-                        import sys
-                        import traceback
-                        error_msg = f"[ERROR] Failed to process connector file {file_info['id']}: {e}"
-                        print(error_msg, file=sys.stderr, flush=True)
-                        traceback.print_exc(file=sys.stderr)
-                        sys.stderr.flush()
-                        # Also store full traceback in task error
-                        full_error = f"{str(e)}\n{traceback.format_exc()}"
-                        file_task.status = TaskStatus.FAILED
-                        file_task.error = full_error
-                        upload_task.failed_files += 1
-                    finally:
-                        file_task.updated_at = datetime.datetime.now().timestamp()
-                        upload_task.processed_files += 1
-                        upload_task.updated_at = datetime.datetime.now().timestamp()
-            # Process all files concurrently
-            tasks = [process_connector_file(file_info) for file_info in files_to_process]
-            await asyncio.gather(*tasks, return_exceptions=True)
-            # Update connection last sync time
-            await self.connection_manager.update_last_sync(connection_id)
-            upload_task.status = TaskStatus.COMPLETED
-            upload_task.updated_at = datetime.datetime.now().timestamp()
-        except Exception as e:
-            import sys
-            import traceback
-            error_msg = f"[ERROR] Background connector sync failed for task {task_id}: {e}"
-            print(error_msg, file=sys.stderr, flush=True)
-            traceback.print_exc(file=sys.stderr)
-            sys.stderr.flush()
-            if user_id in task_store and task_id in task_store[user_id]:
-                task_store[user_id][task_id].status = TaskStatus.FAILED
-                task_store[user_id][task_id].updated_at = datetime.datetime.now().timestamp()
+        # Create custom processor for connector files
+        from models.processors import ConnectorFileProcessor
+        processor = ConnectorFileProcessor(self, connection_id, files_to_process)
+        # Use file IDs as items (no more fake file paths!)
+        file_ids = [file_info['id'] for file_info in files_to_process]
+        # Create custom task using TaskService
+        task_id = await self.task_service.create_custom_task(user_id, file_ids, processor)
+        return task_id
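
For orientation, a minimal sketch of how a caller might drive the refactored sync path; the Starlette route and response shape below are illustrative assumptions, not part of this commit, but sync_connector_files and the task_id it returns are:

    # Hypothetical Starlette handler (illustrative only).
    from starlette.responses import JSONResponse

    async def sync_connection(request):
        user_id = request.path_params["user_id"]              # assumed routing
        connection_id = request.path_params["connection_id"]
        # ConnectorService now delegates to TaskService.create_custom_task
        task_id = await connector_service.sync_connector_files(connection_id, user_id)
        return JSONResponse({"task_id": task_id})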

View file

@@ -1,10 +1,18 @@
 import asyncio
 import atexit
-import torch
+import multiprocessing
 from functools import partial
 from starlette.applications import Starlette
 from starlette.routing import Route
+# Set multiprocessing start method to 'spawn' for CUDA compatibility
+multiprocessing.set_start_method('spawn', force=True)
+# Create process pool FIRST, before any torch/CUDA imports
+from utils.process_pool import process_pool
+import torch
 # Configuration and setup
 from config.settings import clients, INDEX_NAME, INDEX_BODY, SESSION_SECRET
 from utils.gpu_detection import detect_gpu_devices
@@ -65,19 +73,20 @@ def initialize_services():
     # Initialize services
     document_service = DocumentService()
     search_service = SearchService()
-    task_service = TaskService(document_service)
+    task_service = TaskService(document_service, process_pool)
     chat_service = ChatService()
     # Set process pool for document service
-    document_service.process_pool = task_service.process_pool
+    document_service.process_pool = process_pool
     # Initialize connector service
     connector_service = ConnectorService(
         opensearch_client=clients.opensearch,
         patched_async_client=clients.patched_async_client,
-        process_pool=task_service.process_pool,
+        process_pool=process_pool,
         embed_model="text-embedding-3-small",
-        index_name=INDEX_NAME
+        index_name=INDEX_NAME,
+        task_service=task_service
     )
     # Initialize auth service

src/models/processors.py Normal file
View file

@@ -0,0 +1,78 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+from .tasks import UploadTask, FileTask
+
+
+class TaskProcessor(ABC):
+    """Abstract base class for task processors"""
+
+    @abstractmethod
+    async def process_item(self, upload_task: UploadTask, item: Any, file_task: FileTask) -> None:
+        """
+        Process a single item in the task.
+
+        Args:
+            upload_task: The overall upload task
+            item: The item to process (could be file path, file info, etc.)
+            file_task: The specific file task to update
+        """
+        pass
+
+
+class DocumentFileProcessor(TaskProcessor):
+    """Default processor for regular file uploads"""
+
+    def __init__(self, document_service):
+        self.document_service = document_service
+
+    async def process_item(self, upload_task: UploadTask, item: str, file_task: FileTask) -> None:
+        """Process a regular file path using DocumentService"""
+        # This calls the existing logic
+        await self.document_service.process_single_file_task(upload_task, item)
+
+
+class ConnectorFileProcessor(TaskProcessor):
+    """Processor for connector file uploads"""
+
+    def __init__(self, connector_service, connection_id: str, files_to_process: list):
+        self.connector_service = connector_service
+        self.connection_id = connection_id
+        self.files_to_process = files_to_process
+        # Create lookup map for file info
+        self.file_info_map = {f['id']: f for f in files_to_process}
+
+    async def process_item(self, upload_task: UploadTask, item: str, file_task: FileTask) -> None:
+        """Process a connector file using ConnectorService"""
+        from models.tasks import TaskStatus
+        import time
+
+        file_id = item  # item is the connector file ID
+        file_info = self.file_info_map.get(file_id)
+        if not file_info:
+            raise ValueError(f"File info not found for {file_id}")
+
+        # Get the connector
+        connector = await self.connector_service.get_connector(self.connection_id)
+        if not connector:
+            raise ValueError(f"Connection '{self.connection_id}' not found")
+
+        # Get file content from connector
+        document = await connector.get_file_content(file_info['id'])
+
+        # Get user_id from task store lookup
+        user_id = None
+        for uid, tasks in self.connector_service.task_service.task_store.items():
+            if upload_task.task_id in tasks:
+                user_id = uid
+                break
+        if not user_id:
+            raise ValueError("Could not determine user_id for task")
+
+        # Process using existing pipeline
+        result = await self.connector_service.process_connector_document(document, user_id)
+
+        file_task.status = TaskStatus.COMPLETED
+        file_task.result = result
+        upload_task.successful_files += 1
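
Because TaskProcessor is an ABC keyed only on process_item, other item types can plug into the same task machinery. A minimal sketch of a hypothetical third processor (not in this commit; index_text is an assumed helper):

    from models.processors import TaskProcessor
    from models.tasks import TaskStatus, UploadTask, FileTask

    class TextSnippetProcessor(TaskProcessor):
        """Hypothetical processor that indexes raw text snippets."""
        def __init__(self, document_service):
            self.document_service = document_service

        async def process_item(self, upload_task: UploadTask, item: str, file_task: FileTask) -> None:
            # 'item' is the snippet itself; index it via an assumed helper.
            result = await self.document_service.index_text(item)  # hypothetical method
            file_task.status = TaskStatus.COMPLETED
            file_task.result = result
            upload_task.successful_files += 1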

View file

@@ -132,7 +132,7 @@ class DocumentService:
         file_task.updated_at = time.time()
         try:
-            # Check if file already exists in index
+            # Handle regular file processing
             loop = asyncio.get_event_loop()
             # Run CPU-intensive docling processing in separate process

View file

@@ -3,21 +3,21 @@ import uuid
 import time
 import random
 from typing import Dict
-from concurrent.futures import ProcessPoolExecutor
 from models.tasks import TaskStatus, UploadTask, FileTask
-from utils.gpu_detection import get_worker_count
+from src.utils.gpu_detection import get_worker_count
 class TaskService:
-    def __init__(self, document_service=None):
+    def __init__(self, document_service=None, process_pool=None):
         self.document_service = document_service
+        self.process_pool = process_pool
         self.task_store: Dict[str, Dict[str, UploadTask]] = {}  # user_id -> {task_id -> UploadTask}
         self.background_tasks = set()
-        # Initialize process pool
-        max_workers = get_worker_count()
-        self.process_pool = ProcessPoolExecutor(max_workers=max_workers)
-        print(f"Process pool initialized with {max_workers} workers")
+        if self.process_pool is None:
+            raise ValueError("TaskService requires a process_pool parameter")
     async def exponential_backoff_delay(self, retry_count: int, base_delay: float = 1.0, max_delay: float = 60.0) -> None:
         """Apply exponential backoff with jitter"""
@@ -26,19 +26,29 @@ class TaskService:
     async def create_upload_task(self, user_id: str, file_paths: list) -> str:
         """Create a new upload task for bulk file processing"""
+        # Use default DocumentFileProcessor
+        from models.processors import DocumentFileProcessor
+        processor = DocumentFileProcessor(self.document_service)
+        return await self.create_custom_task(user_id, file_paths, processor)
+    async def create_custom_task(self, user_id: str, items: list, processor) -> str:
+        """Create a new task with custom processor for any type of items"""
         task_id = str(uuid.uuid4())
         upload_task = UploadTask(
             task_id=task_id,
-            total_files=len(file_paths),
-            file_tasks={path: FileTask(file_path=path) for path in file_paths}
+            total_files=len(items),
+            file_tasks={str(item): FileTask(file_path=str(item)) for item in items}
         )
+        # Attach the custom processor to the task
+        upload_task.processor = processor
         if user_id not in self.task_store:
             self.task_store[user_id] = {}
         self.task_store[user_id][task_id] = upload_task
         # Start background processing
-        background_task = asyncio.create_task(self.background_upload_processor(user_id, task_id))
+        background_task = asyncio.create_task(self.background_custom_processor(user_id, task_id, items))
         self.background_tasks.add(background_task)
         background_task.add_done_callback(self.background_tasks.discard)
@@ -74,6 +84,58 @@ class TaskService:
             self.task_store[user_id][task_id].status = TaskStatus.FAILED
             self.task_store[user_id][task_id].updated_at = time.time()
+    async def background_custom_processor(self, user_id: str, task_id: str, items: list) -> None:
+        """Background task to process items using custom processor"""
+        try:
+            upload_task = self.task_store[user_id][task_id]
+            upload_task.status = TaskStatus.RUNNING
+            upload_task.updated_at = time.time()
+            processor = upload_task.processor
+            # Process items with limited concurrency
+            max_workers = get_worker_count()
+            semaphore = asyncio.Semaphore(max_workers * 2)
+            async def process_with_semaphore(item, item_key: str):
+                async with semaphore:
+                    file_task = upload_task.file_tasks[item_key]
+                    file_task.status = TaskStatus.RUNNING
+                    file_task.updated_at = time.time()
+                    try:
+                        await processor.process_item(upload_task, item, file_task)
+                    except Exception as e:
+                        print(f"[ERROR] Failed to process item {item}: {e}")
+                        import traceback
+                        traceback.print_exc()
+                        file_task.status = TaskStatus.FAILED
+                        file_task.error = str(e)
+                        upload_task.failed_files += 1
+                    finally:
+                        file_task.updated_at = time.time()
+                        upload_task.processed_files += 1
+                        upload_task.updated_at = time.time()
+            tasks = [
+                process_with_semaphore(item, str(item))
+                for item in items
+            ]
+            await asyncio.gather(*tasks, return_exceptions=True)
+            # Mark task as completed
+            upload_task.status = TaskStatus.COMPLETED
+            upload_task.updated_at = time.time()
+        except Exception as e:
+            print(f"[ERROR] Background custom processor failed for task {task_id}: {e}")
+            import traceback
+            traceback.print_exc()
+            if user_id in self.task_store and task_id in self.task_store[user_id]:
+                self.task_store[user_id][task_id].status = TaskStatus.FAILED
+                self.task_store[user_id][task_id].updated_at = time.time()
     def get_task_status(self, user_id: str, task_id: str) -> dict:
         """Get the status of a specific upload task"""
         if (not task_id or
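
From the caller's side the lifecycle is create-then-poll. A hedged sketch; the diff doesn't show the dict that get_task_status returns, so the field names and values below are assumptions:

    import asyncio

    async def wait_for_task(task_service, user_id: str, task_id: str) -> dict:
        # Poll until the background processor marks the task terminal.
        while True:
            status = task_service.get_task_status(user_id, task_id)
            if status.get("status") in ("completed", "failed"):  # assumed fields
                return status
            await asyncio.sleep(1.0)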

View file

@@ -1,7 +1,6 @@
 import hashlib
 import os
 from collections import defaultdict
-from docling.document_converter import DocumentConverter
 from .gpu_detection import detect_gpu_devices
 # Global converter cache for worker processes
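
The app-level DocumentConverter goes away because conversion now happens inside pool workers; the "global converter cache" comment above refers to a lazy per-process singleton. A minimal sketch of that pattern, with assumed names:

    _converter = None

    def get_converter():
        """Build the Docling converter once per worker process, on first use."""
        global _converter
        if _converter is None:
            from docling.document_converter import DocumentConverter
            _converter = DocumentConverter()
        return _converter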

src/utils/process_pool.py Normal file
View file

@@ -0,0 +1,10 @@
+import os
+from concurrent.futures import ProcessPoolExecutor
+
+from utils.gpu_detection import get_worker_count
+
+# Create shared process pool at import time (before CUDA initialization)
+# This avoids the "Cannot re-initialize CUDA in forked subprocess" error
+MAX_WORKERS = get_worker_count()
+process_pool = ProcessPoolExecutor(max_workers=MAX_WORKERS)
+print(f"Shared process pool initialized with {MAX_WORKERS} workers")