import asyncio
import os
import tempfile
from typing import Any, Dict, List, Optional

from .base import BaseConnector, ConnectorDocument
from .connection_manager import ConnectionManager


class ConnectorService:
    """Service to manage document connectors and process files"""

    def __init__(self, opensearch_client, patched_async_client, process_pool,
                 embed_model: str, index_name: str, task_service=None):
        self.opensearch = opensearch_client
        self.openai_client = patched_async_client
        self.process_pool = process_pool
        self.embed_model = embed_model
        self.index_name = index_name
        self.task_service = task_service
        self.connection_manager = ConnectionManager()

    async def initialize(self):
        """Initialize the service by loading existing connections"""
        await self.connection_manager.load_connections()

    async def get_connector(self, connection_id: str) -> Optional[BaseConnector]:
        """Get a connector by connection ID"""
        return await self.connection_manager.get_connector(connection_id)

    async def process_connector_document(self, document: ConnectorDocument,
                                         owner_user_id: str) -> Dict[str, Any]:
        """Process a document from a connector using the existing processing pipeline"""
        # Write the document content to a temporary file so the regular
        # file-based pipeline can consume it. The file is closed before
        # processing and explicitly removed afterwards.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=self._get_file_extension(document.mimetype)
        ) as tmp_file:
            tmp_file.write(document.content)
            tmp_file.flush()

        try:
            # Imported locally to avoid a circular dependency between the
            # connector and document services.
            from services.document_service import DocumentService

            doc_service = DocumentService()

            # Process using the existing pipeline, with the connector document
            # ID standing in for the content hash.
            result = await doc_service.process_file_common(
                file_path=tmp_file.name,
                file_hash=document.id,
                owner_user_id=owner_user_id,
            )

            # If successfully indexed, enrich the chunks with connector metadata
            if result["status"] == "indexed":
                await self._update_connector_metadata(document, owner_user_id)

            return {
                **result,
                "filename": document.filename,
                "source_url": document.source_url,
            }
        finally:
            # Clean up the temporary file
            os.unlink(tmp_file.name)

    async def _update_connector_metadata(self, document: ConnectorDocument,
                                         owner_user_id: str):
        """Update indexed chunks with connector-specific metadata"""
        # Find all chunks for this document. The default search page size is
        # 10, so raise it to cover documents that produce many chunks.
        query = {
            "size": 10000,
            "query": {"term": {"document_id": document.id}},
        }
        response = await self.opensearch.search(index=self.index_name, body=query)

        # Apply a partial update to each chunk
        for hit in response["hits"]["hits"]:
            chunk_id = hit["_id"]
            update_body = {
                "doc": {
                    "source_url": document.source_url,
                    "connector_type": "google_drive",  # could be passed as a parameter
                    # Additional ACL info beyond the owner (already set by
                    # process_file_common)
                    "allowed_users": document.acl.allowed_users,
                    "allowed_groups": document.acl.allowed_groups,
                    "user_permissions": document.acl.user_permissions,
                    "group_permissions": document.acl.group_permissions,
                    # Timestamps
                    "created_time": document.created_time.isoformat(),
                    "modified_time": document.modified_time.isoformat(),
                    # Additional metadata
                    "metadata": document.metadata,
                }
            }
            await self.opensearch.update(index=self.index_name, id=chunk_id, body=update_body)
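    # Illustrative only: the denormalized ACL fields written above are meant
    # to be filtered on at query time. Below is a minimal sketch of such a
    # filter, assuming search requests know the caller's user ID and group
    # memberships. The helper and the "owner" field name are assumptions, not
    # used elsewhere in this module.
    @staticmethod
    def _build_acl_filter(user_id: str, groups: List[str]) -> Dict[str, Any]:
        """Sketch of an OpenSearch bool filter matching the chunk ACL fields."""
        return {
            "bool": {
                "should": [
                    {"term": {"owner": user_id}},          # assumed owner field
                    {"term": {"allowed_users": user_id}},
                    {"terms": {"allowed_groups": groups}},  # empty list matches nothing
                ],
                "minimum_should_match": 1,
            }
        }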
    def _get_file_extension(self, mimetype: str) -> str:
        """Get file extension based on MIME type"""
        mime_to_ext = {
            'application/pdf': '.pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
            'application/vnd.ms-powerpoint': '.ppt',
            'text/plain': '.txt',
            'text/html': '.html',
            'application/rtf': '.rtf',
            'application/vnd.google-apps.document': '.pdf',  # exported as PDF
            'application/vnd.google-apps.presentation': '.pdf',
            'application/vnd.google-apps.spreadsheet': '.pdf',
        }
        return mime_to_ext.get(mimetype, '.bin')

    async def sync_connector_files(self, connection_id: str, user_id: str,
                                   max_files: Optional[int] = None) -> str:
        """Sync files from a connector connection using the existing task tracking system"""
        if not self.task_service:
            raise ValueError("TaskService not available - connector sync requires task service dependency")

        print(f"[DEBUG] Starting sync for connection {connection_id}, max_files={max_files}")
        connector = await self.get_connector(connection_id)
        if not connector:
            raise ValueError(f"Connection '{connection_id}' not found or not authenticated")

        print(f"[DEBUG] Got connector, authenticated: {connector.is_authenticated}")
        if not connector.is_authenticated:
            raise ValueError(f"Connection '{connection_id}' not authenticated")

        # Collect files to process, capped at max_files when one is given
        files_to_process = []
        page_token = None
        # Pick a page size that minimizes API calls without exceeding the cap
        page_size = min(max_files, 1000) if max_files else 100

        while True:
            print(f"[DEBUG] Calling list_files with page_size={page_size}, page_token={page_token}")
            file_list = await connector.list_files(page_token, limit=page_size)
            files = file_list.get('files', [])
            print(f"[DEBUG] Got {len(files)} files")
            if not files:
                break

            for file_info in files:
                if max_files and len(files_to_process) >= max_files:
                    break
                files_to_process.append(file_info)

            # Stop once we have enough files or there are no more pages
            if (max_files and len(files_to_process) >= max_files) or not file_list.get('nextPageToken'):
                break
            page_token = file_list.get('nextPageToken')

        if not files_to_process:
            raise ValueError("No files found to sync")

        # Create a custom processor for connector files
        from models.processors import ConnectorFileProcessor
        processor = ConnectorFileProcessor(self, connection_id, files_to_process, user_id)

        # Use connector file IDs as task items instead of synthetic file paths
        file_ids = [file_info['id'] for file_info in files_to_process]

        # Create a custom task via the TaskService
        task_id = await self.task_service.create_custom_task(user_id, file_ids, processor)
        return task_id
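    # Note: `connector.list_files` is assumed to return a Google Drive style
    # page, i.e. {"files": [{"id": ...}, ...]} plus an optional
    # "nextPageToken"; the pagination loop above relies on exactly those two
    # keys.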
    async def sync_specific_files(self, connection_id: str, user_id: str,
                                  file_ids: List[str]) -> str:
        """Sync specific files by their IDs (used for webhook-triggered syncs)"""
        if not self.task_service:
            raise ValueError("TaskService not available - connector sync requires task service dependency")

        connector = await self.get_connector(connection_id)
        if not connector:
            raise ValueError(f"Connection '{connection_id}' not found or not authenticated")
        if not connector.is_authenticated:
            raise ValueError(f"Connection '{connection_id}' not authenticated")
        if not file_ids:
            raise ValueError("No file IDs provided")

        # Create a custom processor for the specific connector files. The bare
        # file IDs are passed as the files_info; the processor handles
        # ID-only entries.
        from models.processors import ConnectorFileProcessor
        processor = ConnectorFileProcessor(self, connection_id, file_ids, user_id)

        # Create a custom task via the TaskService
        task_id = await self.task_service.create_custom_task(user_id, file_ids, processor)
        return task_id

    async def _get_connector(self, connection_id: str) -> Optional[BaseConnector]:
        """Get a connector by connection ID (alias for get_connector)"""
        return await self.get_connector(connection_id)
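
# Minimal wiring sketch, not exercised by the application. Everything below is
# illustrative: the OpenSearch address, index and embedding model names, and
# the "example-connection" ID are assumptions; a real deployment would pass the
# app's patched OpenAI client and a TaskService instance. Run as a module
# (python -m ...) so the relative imports resolve.
if __name__ == "__main__":
    from concurrent.futures import ProcessPoolExecutor

    from opensearchpy import AsyncOpenSearch

    async def _demo():
        service = ConnectorService(
            opensearch_client=AsyncOpenSearch(hosts=["http://localhost:9200"]),
            patched_async_client=None,  # stand-in; never touched by this demo
            process_pool=ProcessPoolExecutor(),
            embed_model="text-embedding-3-small",  # hypothetical model name
            index_name="documents",
            task_service=None,  # sync_* methods would raise without a TaskService
        )
        await service.initialize()
        connector = await service.get_connector("example-connection")
        print("connector found:", connector is not None)

    asyncio.run(_demo())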