# openrag/src/connectors/service.py

from typing import Any, Dict, List, Optional

from utils.logging_config import get_logger

from .base import BaseConnector, ConnectorDocument
from .connection_manager import ConnectionManager

logger = get_logger(__name__)


class ConnectorService:
    """Service to manage document connectors and process files"""

    def __init__(
        self,
        patched_async_client,
        process_pool,
        embed_model: str,
        index_name: str,
        task_service=None,
        session_manager=None,
    ):
        self.clients = patched_async_client  # Store the clients object to access the property
        self.process_pool = process_pool
        self.embed_model = embed_model
        self.index_name = index_name
        self.task_service = task_service
        self.session_manager = session_manager
        self.connection_manager = ConnectionManager()

    async def initialize(self):
        """Initialize the service by loading existing connections"""
        await self.connection_manager.load_connections()

    async def get_connector(self, connection_id: str) -> Optional[BaseConnector]:
        """Get a connector by connection ID"""
        return await self.connection_manager.get_connector(connection_id)
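
    # A minimal usage sketch (not part of this module): how the service might be
    # wired up at application startup. The client, pool, and index values below
    # are hypothetical placeholders, not names defined elsewhere in this file.
    #
    #   service = ConnectorService(
    #       patched_async_client=opensearch_clients,   # assumed client wrapper
    #       process_pool=process_pool,                 # e.g. a ProcessPoolExecutor
    #       embed_model="example-embed-model",         # assumed model name
    #       index_name="documents",                    # assumed index name
    #       task_service=task_service,
    #       session_manager=session_manager,
    #   )
    #   await service.initialize()
    #   connector = await service.get_connector("conn-123")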

    async def process_connector_document(
        self,
        document: ConnectorDocument,
        owner_user_id: str,
        connector_type: str,
        jwt_token: str = None,
        owner_name: str = None,
        owner_email: str = None,
    ) -> Dict[str, Any]:
        """Process a document from a connector using existing processing pipeline"""
        # Create temporary file from document content
        from utils.file_utils import auto_cleanup_tempfile

        with auto_cleanup_tempfile(
            suffix=self._get_file_extension(document.mimetype)
        ) as tmp_path:
            # Write document content to temp file
            with open(tmp_path, "wb") as f:
                f.write(document.content)

            # Use existing process_file_common function with connector document metadata
            # We'll use the document service's process_file_common method
            from services.document_service import DocumentService

            doc_service = DocumentService(session_manager=self.session_manager)

            logger.debug("Processing connector document", document_id=document.id)

            # Process using consolidated processing pipeline
            from models.processors import TaskProcessor

            processor = TaskProcessor(document_service=doc_service)
            result = await processor.process_document_standard(
                file_path=tmp_path,
                file_hash=document.id,  # Use connector document ID as hash
                owner_user_id=owner_user_id,
                original_filename=document.filename,  # Pass the original Google Doc title
                jwt_token=jwt_token,
                owner_name=owner_name,
                owner_email=owner_email,
                file_size=len(document.content) if document.content else 0,
                connector_type=connector_type,
            )

            logger.debug("Document processing result", result=result)

            # If successfully indexed or already exists, update the indexed documents with connector metadata
            if result["status"] in ["indexed", "unchanged"]:
                # Update all chunks with connector-specific metadata
                await self._update_connector_metadata(
                    document, owner_user_id, connector_type, jwt_token
                )

            return {
                **result,
                "filename": document.filename,
                "source_url": document.source_url,
            }
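
    # A hedged example of how a caller might feed a fetched file through this
    # method. The ConnectorDocument fields shown are inferred from how they are
    # used above; the literal values and keyword names are illustrative only.
    #
    #   doc = ConnectorDocument(
    #       id="gdrive-file-id",
    #       filename="Quarterly Report",
    #       mimetype="application/pdf",
    #       content=pdf_bytes,
    #       source_url="https://example.invalid/file",
    #       acl=acl, created_time=created, modified_time=modified, metadata={},
    #   )
    #   result = await service.process_connector_document(
    #       doc, owner_user_id="user-1", connector_type="google_drive",
    #   )
    #   # result carries the pipeline status plus "filename" and "source_url"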

    async def _update_connector_metadata(
        self,
        document: ConnectorDocument,
        owner_user_id: str,
        connector_type: str,
        jwt_token: str = None,
    ):
        """Update indexed chunks with connector-specific metadata"""
        logger.debug("Looking for chunks", document_id=document.id)

        # Find all chunks for this document
        query = {"query": {"term": {"document_id": document.id}}}

        # Get user's OpenSearch client
        opensearch_client = self.session_manager.get_user_opensearch_client(
            owner_user_id, jwt_token
        )

        try:
            response = await opensearch_client.search(index=self.index_name, body=query)
        except Exception as e:
            logger.error(
                "OpenSearch search failed for connector metadata update",
                error=str(e),
                query=query,
            )
            raise

        logger.debug(
            "Search query executed",
            query=query,
            chunks_found=len(response["hits"]["hits"]),
            document_id=document.id,
        )

        # Update each chunk with connector metadata
        logger.debug(
            "Updating chunks with connector_type",
            chunk_count=len(response["hits"]["hits"]),
            connector_type=connector_type,
        )
        for hit in response["hits"]["hits"]:
            chunk_id = hit["_id"]
            current_connector_type = hit["_source"].get("connector_type", "unknown")
            logger.debug(
                "Updating chunk connector metadata",
                chunk_id=chunk_id,
                current_connector_type=current_connector_type,
                new_connector_type=connector_type,
            )

            update_body = {
                "doc": {
                    "source_url": document.source_url,
                    "connector_type": connector_type,  # Override the "local" set by process_file_common
                    # Additional ACL info beyond owner (already set by process_file_common)
                    "allowed_users": document.acl.allowed_users,
                    "allowed_groups": document.acl.allowed_groups,
                    "user_permissions": document.acl.user_permissions,
                    "group_permissions": document.acl.group_permissions,
                    # Timestamps
                    "created_time": document.created_time.isoformat()
                    if document.created_time
                    else None,
                    "modified_time": document.modified_time.isoformat()
                    if document.modified_time
                    else None,
                    # Additional metadata
                    "metadata": document.metadata,
                }
            }

            try:
                await opensearch_client.update(
                    index=self.index_name, id=chunk_id, body=update_body
                )
                logger.debug("Updated chunk with connector metadata", chunk_id=chunk_id)
            except Exception as e:
                logger.error(
                    "OpenSearch update failed for chunk",
                    chunk_id=chunk_id,
                    error=str(e),
                    update_body=update_body,
                )
                raise

    def _get_file_extension(self, mimetype: str) -> str:
        """Get file extension based on MIME type"""
        mime_to_ext = {
            "application/pdf": ".pdf",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
            "application/msword": ".doc",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            "application/vnd.ms-powerpoint": ".ppt",
            "text/plain": ".txt",
            "text/html": ".html",
            "application/rtf": ".rtf",
            "application/vnd.google-apps.document": ".pdf",  # Exported as PDF
            "application/vnd.google-apps.presentation": ".pdf",
            "application/vnd.google-apps.spreadsheet": ".pdf",
        }
        return mime_to_ext.get(mimetype, ".bin")
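
    # Illustrative lookups (not executed here): Google Workspace types map to the
    # ".pdf" they are exported as, and unknown MIME types fall back to ".bin".
    #
    #   self._get_file_extension("application/vnd.google-apps.document")  # -> ".pdf"
    #   self._get_file_extension("image/png")                             # -> ".bin"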

    async def sync_connector_files(
        self,
        connection_id: str,
        user_id: str,
        max_files: int = None,
        jwt_token: str = None,
    ) -> str:
        """Sync files from a connector connection using existing task tracking system"""
        if not self.task_service:
            raise ValueError(
                "TaskService not available - connector sync requires task service dependency"
            )

        logger.debug(
            "Starting sync for connection",
            connection_id=connection_id,
            max_files=max_files,
        )

        connector = await self.get_connector(connection_id)
        if not connector:
            raise ValueError(
                f"Connection '{connection_id}' not found or not authenticated"
            )

        logger.debug("Got connector", authenticated=connector.is_authenticated)
        if not connector.is_authenticated:
            raise ValueError(f"Connection '{connection_id}' not authenticated")

        # Collect files to process (limited by max_files)
        files_to_process = []
        page_token = None

        # Calculate page size to minimize API calls
        page_size = min(max_files or 100, 1000) if max_files else 100

        while True:
            # List files from connector with limit
            logger.debug(
                "Calling list_files", page_size=page_size, page_token=page_token
            )
            file_list = await connector.list_files(page_token, limit=page_size)
            logger.debug(
                "Got files from connector", file_count=len(file_list.get("files", []))
            )

            files = file_list["files"]
            if not files:
                break

            for file_info in files:
                if max_files and len(files_to_process) >= max_files:
                    break
                files_to_process.append(file_info)

            # Stop if we have enough files or no more pages
            if (max_files and len(files_to_process) >= max_files) or not file_list.get(
                "nextPageToken"
            ):
                break

            page_token = file_list.get("nextPageToken")

        # Get user information
        user = self.session_manager.get_user(user_id) if self.session_manager else None
        owner_name = user.name if user else None
        owner_email = user.email if user else None

        # Create custom processor for connector files
        from models.processors import ConnectorFileProcessor
        from services.document_service import DocumentService

        processor = ConnectorFileProcessor(
            self,
            connection_id,
            files_to_process,
            user_id,
            jwt_token=jwt_token,
            owner_name=owner_name,
            owner_email=owner_email,
            document_service=(
                self.task_service.document_service
                if self.task_service and self.task_service.document_service
                else DocumentService(session_manager=self.session_manager)
            ),
        )

        # Use file IDs as items (no more fake file paths!)
        file_ids = [file_info["id"] for file_info in files_to_process]

        # Create custom task using TaskService
        task_id = await self.task_service.create_custom_task(
            user_id, file_ids, processor
        )

        return task_id
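
    # A hedged sketch of how a caller (e.g. a route handler) might start a bounded
    # sync. The connection ID, user object, and JWT variable are hypothetical; only
    # sync_connector_files and its parameters come from this module.
    #
    #   task_id = await connector_service.sync_connector_files(
    #       connection_id="conn-123",
    #       user_id=current_user.id,
    #       max_files=50,
    #       jwt_token=request_jwt,
    #   )
    #   # Progress can then be tracked through the existing task system via task_id.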

    async def sync_specific_files(
        self,
        connection_id: str,
        user_id: str,
        file_ids: List[str],
        jwt_token: str = None,
    ) -> str:
        """
        Sync specific files by their IDs (used for webhook-triggered syncs or manual selection).
        Automatically expands folders to their contents.
        """
        if not self.task_service:
            raise ValueError(
                "TaskService not available - connector sync requires task service dependency"
            )

        connector = await self.get_connector(connection_id)
        if not connector:
            raise ValueError(
                f"Connection '{connection_id}' not found or not authenticated"
            )

        if not connector.is_authenticated:
            raise ValueError(f"Connection '{connection_id}' not authenticated")

        if not file_ids:
            raise ValueError("No file IDs provided")

        # Get user information
        user = self.session_manager.get_user(user_id) if self.session_manager else None
        owner_name = user.name if user else None
        owner_email = user.email if user else None

        # Temporarily set file_ids in the connector's config so list_files() can use them
        # Store the original values to restore later
        original_file_ids = None
        original_folder_ids = None
        if hasattr(connector, "cfg"):
            original_file_ids = getattr(connector.cfg, "file_ids", None)
            original_folder_ids = getattr(connector.cfg, "folder_ids", None)

        try:
            # Set the file_ids we want to sync in the connector's config
            if hasattr(connector, "cfg"):
                connector.cfg.file_ids = file_ids  # type: ignore
                connector.cfg.folder_ids = None  # type: ignore

            # Get the expanded list of file IDs (folders will be expanded to their contents)
            # This uses the connector's list_files() which calls _iter_selected_items()
            result = await connector.list_files()
            expanded_file_ids = [f["id"] for f in result.get("files", [])]

            if not expanded_file_ids:
                logger.warning(
                    f"No files found after expanding file_ids. "
                    f"Original IDs: {file_ids}. This may indicate all IDs were folders "
                    f"with no contents, or files that were filtered out."
                )
                # Return empty task rather than failing
                raise ValueError("No files to sync after expanding folders")
        except Exception as e:
            logger.error(f"Failed to expand file_ids via list_files(): {e}")
            # Fallback to original file_ids if expansion fails
            expanded_file_ids = file_ids
        finally:
            # Restore original config values
            if hasattr(connector, "cfg"):
                connector.cfg.file_ids = original_file_ids  # type: ignore
                connector.cfg.folder_ids = original_folder_ids  # type: ignore

        # Create custom processor for specific connector files
        from models.processors import ConnectorFileProcessor
        from services.document_service import DocumentService

        # Use expanded_file_ids which has folders already expanded
        processor = ConnectorFileProcessor(
            self,
            connection_id,
            expanded_file_ids,
            user_id,
            jwt_token=jwt_token,
            owner_name=owner_name,
            owner_email=owner_email,
            document_service=(
                self.task_service.document_service
                if self.task_service and self.task_service.document_service
                else DocumentService(session_manager=self.session_manager)
            ),
        )

        # Create custom task using TaskService
        task_id = await self.task_service.create_custom_task(
            user_id, expanded_file_ids, processor
        )

        return task_id
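
    # A hedged sketch of a webhook-style call into this method. The IDs shown are
    # placeholders; folder IDs are acceptable as well, since list_files() expands
    # them to their contents before the task is created.
    #
    #   task_id = await connector_service.sync_specific_files(
    #       connection_id="conn-123",
    #       user_id=current_user.id,
    #       file_ids=["file-abc", "folder-xyz"],
    #       jwt_token=request_jwt,
    #   )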

    async def _get_connector(self, connection_id: str) -> Optional[BaseConnector]:
        """Get a connector by connection ID (alias for get_connector)"""
        return await self.get_connector(connection_id)