Updated all relative imports to use absolute import paths for consistency and improved maintainability. Also replaced bare except clauses with 'except Exception' for better error handling, and made minor code cleanups such as removing unused variables and improving import order. No functional changes were made.
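
# Illustration of the patterns described above (the "before" lines are hypothetical):
#     before: from ..base import BaseConnector            # relative import
#     after:  from connectors.base import BaseConnector   # absolute import
#
#     before: except:             # bare except also traps SystemExit/KeyboardInterrupt
#     after:  except Exception:   # catches only ordinary runtime errors
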
import logging
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import parse_qs, urlparse

import httpx

from connectors.base import BaseConnector, ConnectorDocument, DocumentACL
from connectors.sharepoint.oauth import SharePointOAuth

logger = logging.getLogger(__name__)


class SharePointConnector(BaseConnector):
    """SharePoint connector using MSAL-based OAuth for authentication."""

    # Required BaseConnector class attributes
    CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"

    # Connector metadata
    CONNECTOR_NAME = "SharePoint"
    CONNECTOR_DESCRIPTION = "Add knowledge from SharePoint"
    CONNECTOR_ICON = "sharepoint"

    def __init__(self, config: Dict[str, Any]):
        logger.debug(
            f"SharePoint connector __init__ called with config type: {type(config)}"
        )
        logger.debug(f"SharePoint connector __init__ config value: {config}")

        # Ensure we always pass a valid config to the base class. The previous
        # unguarded super().__init__(config) call before this check was both
        # redundant (it was repeated below) and unsafe when config was None.
        if config is None:
            logger.debug("Config was None, using empty dict")
            config = {}

        try:
            logger.debug("Calling super().__init__")
            super().__init__(config)  # Safe now: config is a dict, never None
            logger.debug("super().__init__ completed successfully")
        except Exception as e:
            logger.error(f"super().__init__ failed: {e}")
            raise

        # Initialize with defaults that allow the connector to be listed
        self.client_id = None
        self.client_secret = None
        self.tenant_id = config.get("tenant_id", "common")
        self.sharepoint_url = config.get("sharepoint_url")
        self.redirect_uri = config.get("redirect_uri", "http://localhost")

        # Try to get credentials, but don't fail if they're missing:
        # the connector must remain listable without them.
        try:
            logger.debug("Attempting to get client_id")
            self.client_id = self.get_client_id()
            logger.debug(f"Got client_id: {self.client_id is not None}")
        except Exception as e:
            logger.debug(f"Failed to get client_id: {e}")

        try:
            logger.debug("Attempting to get client_secret")
            self.client_secret = self.get_client_secret()
            logger.debug(f"Got client_secret: {self.client_secret is not None}")
        except Exception as e:
            logger.debug(f"Failed to get client_secret: {e}")

        # Token file setup
        project_root = Path(__file__).resolve().parent.parent.parent.parent
        token_file = config.get("token_file") or str(
            project_root / "sharepoint_token.json"
        )
        Path(token_file).parent.mkdir(parents=True, exist_ok=True)

        # Only initialize OAuth if we have credentials
        if self.client_id and self.client_secret:
            connection_id = config.get("connection_id", "default")

            # Use token_file from config if provided, otherwise generate one
            if config.get("token_file"):
                oauth_token_file = config["token_file"]
            else:
                oauth_token_file = f"sharepoint_token_{connection_id}.json"

            # Both branches of the original conditional produced the same URL
            # ("common" is just another tenant segment), so one f-string suffices.
            authority = f"https://login.microsoftonline.com/{self.tenant_id}"

            self.oauth = SharePointOAuth(
                client_id=self.client_id,
                client_secret=self.client_secret,
                token_file=oauth_token_file,
                authority=authority,
            )
        else:
            self.oauth = None

        # Track subscription ID for webhooks
        self._subscription_id: Optional[str] = None

        # Graph API defaults, similar to the Google Drive connector's flags
        self._graph_api_version = "v1.0"
        self._default_params = {
            "$select": "id,name,size,lastModifiedDateTime,createdDateTime,webUrl,file,folder,@microsoft.graph.downloadUrl"
        }

        # Selective sync support (similar to Google Drive and OneDrive);
        # accept the same settings under any of their historical key names.
        self.cfg = type(
            "SharePointConfig",
            (),
            {
                "file_ids": config.get("file_ids")
                or config.get("selected_files")
                or config.get("selected_file_ids"),
                "folder_ids": config.get("folder_ids")
                or config.get("selected_folders")
                or config.get("selected_folder_ids"),
            },
        )()

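    # Illustrative config for this connector (keys as consumed in __init__;
    # the values are hypothetical):
    #     {
    #         "tenant_id": "contoso",                  # defaults to "common"
    #         "sharepoint_url": "https://contoso.sharepoint.com/sites/teamsite",
    #         "redirect_uri": "http://localhost",
    #         "connection_id": "default",
    #         "token_file": "sharepoint_token.json",   # optional override
    #         "file_ids": ["..."],                     # optional selective sync
    #         "folder_ids": ["..."],                   # optional selective sync
    #         "webhook_url": "https://example.invalid" # used by setup_subscription
    #     }
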
    @property
    def _graph_base_url(self) -> str:
        """Base URL for Microsoft Graph API calls."""
        return f"https://graph.microsoft.com/{self._graph_api_version}"

    def emit(self, doc: ConnectorDocument) -> None:
        """Emit a ConnectorDocument instance.

        This implementation only logs the emission; downstream consumers are
        expected to hook into or override it.
        """
        logger.debug(f"Emitting SharePoint document: {doc.id} ({doc.filename})")

    async def authenticate(self) -> bool:
        """Test authentication - BaseConnector interface"""
        logger.debug(
            f"SharePoint authenticate() called, oauth is None: {self.oauth is None}"
        )
        try:
            if not self.oauth:
                logger.debug("SharePoint authentication failed: OAuth not initialized")
                self._authenticated = False
                return False

            logger.debug("Loading SharePoint credentials...")
            # Try to load existing credentials first
            load_result = await self.oauth.load_credentials()
            logger.debug(f"Load credentials result: {load_result}")

            logger.debug("Checking SharePoint authentication status...")
            authenticated = await self.oauth.is_authenticated()
            logger.debug(f"SharePoint is_authenticated result: {authenticated}")

            self._authenticated = authenticated
            return authenticated
        except Exception as e:
            # logger.exception logs the traceback, replacing the previous
            # inline `import traceback; traceback.print_exc()`.
            logger.exception(f"SharePoint authentication failed: {e}")
            self._authenticated = False
            return False

    def get_auth_url(self) -> str:
        """Get OAuth authorization URL"""
        if not self.oauth:
            raise RuntimeError("SharePoint OAuth not initialized - missing credentials")
        return self.oauth.create_authorization_url(self.redirect_uri)

    async def handle_oauth_callback(self, auth_code: str) -> Dict[str, Any]:
        """Handle OAuth callback"""
        if not self.oauth:
            raise RuntimeError("SharePoint OAuth not initialized - missing credentials")
        try:
            success = await self.oauth.handle_authorization_callback(
                auth_code, self.redirect_uri
            )
            if success:
                self._authenticated = True
                return {"status": "success"}
            else:
                raise ValueError("OAuth callback failed")
        except Exception as e:
            logger.error(f"OAuth callback failed: {e}")
            raise

    def sync_once(self) -> None:
        """
        Perform a one-shot sync of SharePoint files and emit documents.

        This method mirrors the Google Drive connector's sync_once functionality.
        """
        import asyncio

        async def _async_sync():
            try:
                # Get list of files
                file_list = await self.list_files(max_files=1000)  # Adjust as needed
                files = file_list.get("files", [])

                for file_info in files:
                    try:
                        file_id = file_info.get("id")
                        if not file_id:
                            continue

                        # Get full document content
                        doc = await self.get_file_content(file_id)
                        self.emit(doc)

                    except Exception as e:
                        logger.error(
                            f"Failed to sync SharePoint file {file_info.get('name', 'unknown')}: {e}"
                        )
                        continue

            except Exception as e:
                logger.error(f"SharePoint sync_once failed: {e}")
                raise

        # Run the async sync. asyncio.run() exists on Python 3.7+, which this
        # module already assumes (httpx, f-strings), so the old pre-3.7
        # get_event_loop() fallback was dead code and has been dropped.
        asyncio.run(_async_sync())

    async def setup_subscription(self) -> str:
        """Set up real-time subscription for file changes - BaseConnector interface"""
        webhook_url = self.config.get("webhook_url")
        if not webhook_url:
            logger.warning(
                "No webhook URL configured, skipping SharePoint subscription setup"
            )
            return "no-webhook-configured"

        try:
            # Ensure we're authenticated
            if not await self.authenticate():
                raise RuntimeError(
                    "SharePoint authentication failed during subscription setup"
                )

            token = self.oauth.get_access_token()

            # Subscribe to the SharePoint site's default drive, falling back
            # to the signed-in user's OneDrive. Graph subscription resource
            # paths are given without a leading slash.
            site_info = self._parse_sharepoint_url()
            if site_info:
                resource = f"sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/root"
            else:
                resource = "me/drive/root"

            subscription_data = {
                "changeType": "created,updated,deleted",
                "notificationUrl": f"{webhook_url}/webhook/sharepoint",
                "resource": resource,
                "expirationDateTime": self._get_subscription_expiry(),
                "clientState": f"sharepoint_{self.tenant_id}",
            }

            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }

            url = f"{self._graph_base_url}/subscriptions"

            async with httpx.AsyncClient() as client:
                response = await client.post(
                    url, json=subscription_data, headers=headers, timeout=30
                )
                response.raise_for_status()

                result = response.json()
                subscription_id = result.get("id")

                if subscription_id:
                    self._subscription_id = subscription_id
                    logger.info(f"SharePoint subscription created: {subscription_id}")
                    return subscription_id
                else:
                    raise ValueError("No subscription ID returned from Microsoft Graph")

        except Exception as e:
            logger.error(f"Failed to setup SharePoint subscription: {e}")
            raise

    def _get_subscription_expiry(self) -> str:
        """Get subscription expiry time (max 3 days for Graph API)"""
        from datetime import timedelta, timezone

        # datetime.utcnow() is deprecated; use an explicit UTC timestamp.
        expiry = datetime.now(timezone.utc) + timedelta(days=3)
        return expiry.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    def _parse_sharepoint_url(self) -> Optional[Dict[str, str]]:
        """Parse SharePoint URL to extract site information for Graph API"""
        if not self.sharepoint_url:
            return None

        try:
            parsed = urlparse(self.sharepoint_url)
            # Extract hostname and site name from a URL like:
            # https://contoso.sharepoint.com/sites/teamsite
            host_name = parsed.netloc
            path_parts = parsed.path.strip("/").split("/")

            if len(path_parts) >= 2 and path_parts[0] == "sites":
                site_name = path_parts[1]
                return {"host_name": host_name, "site_name": site_name}
        except Exception as e:
            logger.warning(f"Could not parse SharePoint URL {self.sharepoint_url}: {e}")

        return None

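    # Illustrative parse performed above (hypothetical tenant):
    #     "https://contoso.sharepoint.com/sites/teamsite"
    #         -> {"host_name": "contoso.sharepoint.com", "site_name": "teamsite"}
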
    async def list_files(
        self,
        page_token: Optional[str] = None,
        max_files: Optional[int] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """List all files using Microsoft Graph API - BaseConnector interface"""
        try:
            # Ensure authentication
            if not await self.authenticate():
                raise RuntimeError(
                    "SharePoint authentication failed during file listing"
                )

            # If file_ids or folder_ids are specified in config, use selective sync
            if self.cfg.file_ids or self.cfg.folder_ids:
                return await self._list_selected_files()

            files = []
            max_files_value = max_files if max_files is not None else 100

            # Build Graph API URL for the site, or fall back to the user's OneDrive
            site_info = self._parse_sharepoint_url()
            if site_info:
                base_url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/root/children"
            else:
                base_url = f"{self._graph_base_url}/me/drive/root/children"

            params = dict(self._default_params)
            params["$top"] = str(max_files_value)

            if page_token:
                params["$skiptoken"] = page_token

            response = await self._make_graph_request(base_url, params=params)
            data = response.json()

            items = data.get("value", [])
            for item in items:
                # Only include files, not folders
                if item.get("file"):
                    files.append(
                        {
                            "id": item.get("id", ""),
                            "name": item.get("name", ""),
                            "path": f"/drive/items/{item.get('id')}",
                            "size": int(item.get("size", 0)),
                            "modified": item.get("lastModifiedDateTime"),
                            "created": item.get("createdDateTime"),
                            "mime_type": item.get("file", {}).get(
                                "mimeType", self._get_mime_type(item.get("name", ""))
                            ),
                            "url": item.get("webUrl", ""),
                            "download_url": item.get("@microsoft.graph.downloadUrl"),
                        }
                    )

            # Check for the next page: Graph returns an @odata.nextLink URL,
            # from which we extract the $skiptoken to use as our page token.
            # (urlparse and parse_qs come from the module-level import; the
            # previous inline import duplicated it.)
            next_page_token = None
            next_link = data.get("@odata.nextLink")
            if next_link:
                parsed = urlparse(next_link)
                query_params = parse_qs(parsed.query)
                if "$skiptoken" in query_params:
                    next_page_token = query_params["$skiptoken"][0]

            return {"files": files, "next_page_token": next_page_token}

        except Exception as e:
            logger.error(f"Failed to list SharePoint files: {e}")
            # Return an empty result instead of raising
            return {"files": [], "next_page_token": None}

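    # Hedged pagination example (URL shape illustrative, token value hypothetical):
    #     @odata.nextLink = "https://graph.microsoft.com/v1.0/me/drive/root/children?$skiptoken=abc123"
    #         -> next_page_token == "abc123", sent back as $skiptoken on the next call
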
    async def get_file_content(self, file_id: str) -> ConnectorDocument:
        """Get file content and metadata - BaseConnector interface"""
        try:
            # Ensure authentication
            if not await self.authenticate():
                raise RuntimeError(
                    "SharePoint authentication failed during file content retrieval"
                )

            # First get file metadata using Graph API
            file_metadata = await self._get_file_metadata_by_id(file_id)

            if not file_metadata:
                raise ValueError(f"File not found: {file_id}")

            # Download file content, preferring the pre-authenticated download URL
            download_url = file_metadata.get("download_url")
            if download_url:
                content = await self._download_file_from_url(download_url)
            else:
                content = await self._download_file_content(file_id)

            # Create ACL from metadata
            acl = DocumentACL(
                owner="",  # Graph API requires additional calls for detailed permissions
                user_permissions={},
                group_permissions={},
            )

            # Parse dates
            modified_time = self._parse_graph_date(file_metadata.get("modified"))
            created_time = self._parse_graph_date(file_metadata.get("created"))

            return ConnectorDocument(
                id=file_id,
                filename=file_metadata.get("name", ""),
                mimetype=file_metadata.get("mime_type", "application/octet-stream"),
                content=content,
                source_url=file_metadata.get("url", ""),
                acl=acl,
                modified_time=modified_time,
                created_time=created_time,
                metadata={
                    "sharepoint_path": file_metadata.get("path", ""),
                    "sharepoint_url": self.sharepoint_url,
                    "size": file_metadata.get("size", 0),
                },
            )

        except Exception as e:
            logger.error(f"Failed to get SharePoint file content {file_id}: {e}")
            raise

    async def _get_file_metadata_by_id(self, file_id: str) -> Optional[Dict[str, Any]]:
        """Get file metadata by ID using Graph API"""
        try:
            # Try the site-specific path first, then fall back to the user drive
            site_info = self._parse_sharepoint_url()
            if site_info:
                url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/items/{file_id}"
            else:
                url = f"{self._graph_base_url}/me/drive/items/{file_id}"

            params = dict(self._default_params)

            response = await self._make_graph_request(url, params=params)
            item = response.json()

            if item.get("file"):
                return {
                    "id": file_id,
                    "name": item.get("name", ""),
                    "path": f"/drive/items/{file_id}",
                    "size": int(item.get("size", 0)),
                    "modified": item.get("lastModifiedDateTime"),
                    "created": item.get("createdDateTime"),
                    "mime_type": item.get("file", {}).get(
                        "mimeType", self._get_mime_type(item.get("name", ""))
                    ),
                    "url": item.get("webUrl", ""),
                    "download_url": item.get("@microsoft.graph.downloadUrl"),
                }

            # Check if it's a folder
            if item.get("folder"):
                return {
                    "id": file_id,
                    "name": item.get("name", ""),
                    "isFolder": True,
                }

            return None

        except Exception as e:
            logger.error(f"Failed to get file metadata for {file_id}: {e}")
            return None

    async def _download_file_content(self, file_id: str) -> bytes:
        """Download file content by file ID using Graph API"""
        try:
            site_info = self._parse_sharepoint_url()
            if site_info:
                url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/items/{file_id}/content"
            else:
                url = f"{self._graph_base_url}/me/drive/items/{file_id}/content"

            token = self.oauth.get_access_token()
            headers = {"Authorization": f"Bearer {token}"}

            # The Graph /content endpoint answers with a 302 redirect to a
            # pre-authenticated URL; httpx does not follow redirects by
            # default, so enable it explicitly.
            async with httpx.AsyncClient(follow_redirects=True) as client:
                response = await client.get(url, headers=headers, timeout=60)
                response.raise_for_status()
                return response.content

        except Exception as e:
            logger.error(f"Failed to download file content for {file_id}: {e}")
            raise

    async def _list_selected_files(self) -> Dict[str, Any]:
        """List only selected files/folders (selective sync)."""
        files: List[Dict[str, Any]] = []

        # Process selected file IDs
        if self.cfg.file_ids:
            for file_id in self.cfg.file_ids:
                try:
                    file_meta = await self._get_file_metadata_by_id(file_id)
                    if file_meta and not file_meta.get("isFolder", False):
                        files.append(file_meta)
                    elif file_meta and file_meta.get("isFolder", False):
                        # If it's a folder, expand its contents
                        folder_files = await self._list_folder_contents(file_id)
                        files.extend(folder_files)
                except Exception as e:
                    logger.warning(f"Failed to get file {file_id}: {e}")
                    continue

        # Process selected folder IDs
        if self.cfg.folder_ids:
            for folder_id in self.cfg.folder_ids:
                try:
                    folder_files = await self._list_folder_contents(folder_id)
                    files.extend(folder_files)
                except Exception as e:
                    logger.warning(f"Failed to list folder {folder_id}: {e}")
                    continue

        return {"files": files, "next_page_token": None}

    async def _list_folder_contents(self, folder_id: str) -> List[Dict[str, Any]]:
        """List all files in a folder recursively."""
        files: List[Dict[str, Any]] = []

        try:
            site_info = self._parse_sharepoint_url()
            if site_info:
                url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/items/{folder_id}/children"
            else:
                url = f"{self._graph_base_url}/me/drive/items/{folder_id}/children"

            params = dict(self._default_params)

            response = await self._make_graph_request(url, params=params)
            data = response.json()

            items = data.get("value", [])
            for item in items:
                if item.get("file"):  # It's a file
                    file_meta = await self._get_file_metadata_by_id(item.get("id"))
                    if file_meta:
                        files.append(file_meta)
                elif item.get("folder"):  # It's a subfolder, recurse
                    subfolder_files = await self._list_folder_contents(item.get("id"))
                    files.extend(subfolder_files)
        except Exception as e:
            logger.error(f"Failed to list folder contents for {folder_id}: {e}")

        return files

    async def _download_file_from_url(self, download_url: str) -> bytes:
        """Download file content from a direct download URL"""
        try:
            # Pre-authenticated download URLs need no Authorization header
            async with httpx.AsyncClient() as client:
                response = await client.get(download_url, timeout=60)
                response.raise_for_status()
                return response.content
        except Exception as e:
            logger.error(f"Failed to download from URL {download_url}: {e}")
            raise

    def _parse_graph_date(self, date_str: Optional[str]) -> datetime:
        """Parse a Microsoft Graph date string into a naive datetime"""
        if not date_str:
            return datetime.now()

        try:
            # Graph timestamps are ISO 8601, usually with a trailing "Z";
            # strip it so fromisoformat() accepts the value on older Pythons.
            if date_str.endswith("Z"):
                return datetime.fromisoformat(date_str[:-1]).replace(tzinfo=None)
            # fromisoformat() handles the "T" separator directly; the old
            # replace("T", " ") was unnecessary.
            return datetime.fromisoformat(date_str)
        except (ValueError, AttributeError):
            return datetime.now()

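    # Illustrative conversion (timestamp value hypothetical):
    #     "2024-05-01T12:34:56Z" -> datetime(2024, 5, 1, 12, 34, 56)
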
    async def _make_graph_request(
        self,
        url: str,
        method: str = "GET",
        data: Optional[Dict] = None,
        params: Optional[Dict] = None,
    ) -> httpx.Response:
        """Make an authenticated API request to Microsoft Graph"""
        token = self.oauth.get_access_token()
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }

        async with httpx.AsyncClient() as client:
            if method.upper() == "GET":
                response = await client.get(
                    url, headers=headers, params=params, timeout=30
                )
            elif method.upper() == "POST":
                response = await client.post(
                    url, headers=headers, json=data, timeout=30
                )
            elif method.upper() == "DELETE":
                response = await client.delete(url, headers=headers, timeout=30)
            else:
                raise ValueError(f"Unsupported HTTP method: {method}")

            response.raise_for_status()
            return response

    def _get_mime_type(self, filename: str) -> str:
        """Get MIME type based on file extension"""
        import mimetypes

        mime_type, _ = mimetypes.guess_type(filename)
        return mime_type or "application/octet-stream"

    # Webhook methods - BaseConnector interface
    def handle_webhook_validation(
        self, request_method: str, headers: Dict[str, str], query_params: Dict[str, str]
    ) -> Optional[str]:
        """Handle webhook validation (Graph API specific)"""
        # Graph validates a new subscription by POSTing to the notification
        # URL with a validationToken query parameter that must be echoed back.
        if request_method == "POST" and "validationToken" in query_params:
            return query_params["validationToken"]
        return None

    def extract_webhook_channel_id(
        self, payload: Dict[str, Any], headers: Dict[str, str]
    ) -> Optional[str]:
        """Extract channel/subscription ID from webhook payload"""
        notifications = payload.get("value", [])
        if notifications:
            return notifications[0].get("subscriptionId")
        return None

    async def handle_webhook(self, payload: Dict[str, Any]) -> List[str]:
        """Handle webhook notification and return affected file IDs"""
        affected_files = []

        # Process Microsoft Graph webhook payload
        notifications = payload.get("value", [])
        for notification in notifications:
            resource = notification.get("resource")
            if resource and "/drive/items/" in resource:
                file_id = resource.split("/drive/items/")[-1]
                affected_files.append(file_id)

        return affected_files

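    # Abridged sketch of the notification payload handled above (field values
    # hypothetical; real Graph notifications carry additional fields):
    #     {"value": [{"subscriptionId": "...",
    #                 "changeType": "updated",
    #                 "clientState": "sharepoint_common",
    #                 "resource": "me/drive/items/01ABCDEF"}]}
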
    async def cleanup_subscription(self, subscription_id: str) -> bool:
        """Clean up subscription - BaseConnector interface"""
        if subscription_id == "no-webhook-configured":
            logger.info("No subscription to clean up (webhook was not configured)")
            return True

        try:
            # Ensure authentication
            if not await self.authenticate():
                logger.error(
                    "SharePoint authentication failed during subscription cleanup"
                )
                return False

            token = self.oauth.get_access_token()
            headers = {"Authorization": f"Bearer {token}"}

            url = f"{self._graph_base_url}/subscriptions/{subscription_id}"

            async with httpx.AsyncClient() as client:
                response = await client.delete(url, headers=headers, timeout=30)

                # 404 is treated as success: the subscription is already gone
                if response.status_code in [200, 204, 404]:
                    logger.info(
                        f"SharePoint subscription {subscription_id} cleaned up successfully"
                    )
                    return True
                else:
                    logger.warning(
                        f"Unexpected response cleaning up subscription: {response.status_code}"
                    )
                    return False

        except Exception as e:
            logger.error(
                f"Failed to cleanup SharePoint subscription {subscription_id}: {e}"
            )
            return False
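
# Minimal usage sketch (illustrative only; assumes MICROSOFT_GRAPH_OAUTH_CLIENT_ID
# and MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET are set and that a token file already
# holds valid credentials -- the interactive OAuth flow is not shown here):
#
#     import asyncio
#
#     connector = SharePointConnector(
#         {"sharepoint_url": "https://contoso.sharepoint.com/sites/teamsite"}
#     )
#
#     async def main():
#         if await connector.authenticate():
#             listing = await connector.list_files(max_files=10)
#             for f in listing["files"]:
#                 print(f["name"], f["size"])
#
#     asyncio.run(main())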