# openrag/src/connectors/sharepoint/connector.py

import asyncio
import logging
import mimetypes
from datetime import datetime, timedelta, timezone
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Dict, List, Optional
from urllib.parse import parse_qs, urlparse

import httpx

from ..base import BaseConnector, ConnectorDocument, DocumentACL
from .oauth import SharePointOAuth

logger = logging.getLogger(__name__)


class SharePointConnector(BaseConnector):
    """SharePoint connector using MSAL-based OAuth for authentication"""

    # Required BaseConnector class attributes
    CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret

    # Connector metadata
    CONNECTOR_NAME = "SharePoint"
    CONNECTOR_DESCRIPTION = "Add knowledge from SharePoint"
    CONNECTOR_ICON = "sharepoint"

    def __init__(self, config: Dict[str, Any]):
        logger.debug(f"SharePoint connector __init__ called with config type: {type(config)}")
        # Ensure we always pass a valid config to the base class
        if config is None:
            logger.debug("Config was None, using empty dict")
            config = {}
        super().__init__(config)

        # Initialize with defaults that allow the connector to be listed
        # even before credentials are configured
        self.client_id = None
        self.client_secret = None
        self.tenant_id = config.get("tenant_id", "common")
        self.sharepoint_url = config.get("sharepoint_url")
        self.redirect_uri = config.get("redirect_uri", "http://localhost")

        # Try to get credentials, but don't fail if they're missing;
        # missing credentials are fine for merely listing the connector
        try:
            self.client_id = self.get_client_id()
        except Exception as e:
            logger.debug(f"Failed to get client_id: {e}")
        try:
            self.client_secret = self.get_client_secret()
        except Exception as e:
            logger.debug(f"Failed to get client_secret: {e}")

        # Token file setup - use the data/ directory for persistence.
        # Use token_file from config if provided, otherwise derive one
        # from the connection ID.
        connection_id = config.get("connection_id", "default")
        token_file = config.get("token_file") or f"data/sharepoint_token_{connection_id}.json"
        Path(token_file).parent.mkdir(parents=True, exist_ok=True)

        # Only initialize OAuth if we have credentials
        if self.client_id and self.client_secret:
            self.oauth = SharePointOAuth(
                client_id=self.client_id,
                client_secret=self.client_secret,
                token_file=token_file,
                authority=f"https://login.microsoftonline.com/{self.tenant_id}",
            )
        else:
            self.oauth = None

        # Track subscription ID for webhooks
        self._subscription_id: Optional[str] = None

        # Graph API defaults, similar to the Google Drive connector's flags
        self._graph_api_version = "v1.0"
        self._default_params = {
            "$select": "id,name,size,lastModifiedDateTime,createdDateTime,webUrl,file,folder,@microsoft.graph.downloadUrl"
        }

        # Selective sync support (similar to Google Drive and OneDrive)
        self.cfg = SimpleNamespace(
            file_ids=config.get("file_ids") or config.get("selected_files") or config.get("selected_file_ids"),
            folder_ids=config.get("folder_ids") or config.get("selected_folders") or config.get("selected_folder_ids"),
        )
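
    # Example config (illustrative; these are the keys this connector actually
    # reads - the folder ID shown is hypothetical):
    #
    #   connector = SharePointConnector({
    #       "tenant_id": "common",
    #       "sharepoint_url": "https://contoso.sharepoint.com/sites/teamsite",
    #       "redirect_uri": "http://localhost",
    #       "connection_id": "team-a",
    #       "folder_ids": ["01BYE5RZ6QN3ZWBTUFOFD3GSPGOHDJD36K"],
    #   })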

    @property
    def _graph_base_url(self) -> str:
        """Base URL for Microsoft Graph API calls"""
        return f"https://graph.microsoft.com/{self._graph_api_version}"

    def emit(self, doc: ConnectorDocument) -> None:
        """Emit a ConnectorDocument instance."""
        logger.debug(f"Emitting SharePoint document: {doc.id} ({doc.filename})")

    async def authenticate(self) -> bool:
        """Test authentication - BaseConnector interface"""
        logger.debug(f"SharePoint authenticate() called, oauth is None: {self.oauth is None}")
        try:
            if not self.oauth:
                logger.debug("SharePoint authentication failed: OAuth not initialized")
                self._authenticated = False
                return False
            # Try to load existing credentials first
            load_result = await self.oauth.load_credentials()
            logger.debug(f"Load credentials result: {load_result}")
            authenticated = await self.oauth.is_authenticated()
            logger.debug(f"SharePoint is_authenticated result: {authenticated}")
            self._authenticated = authenticated
            return authenticated
        except Exception as e:
            logger.exception(f"SharePoint authentication failed: {e}")
            self._authenticated = False
            return False

    def get_auth_url(self) -> str:
        """Get OAuth authorization URL"""
        if not self.oauth:
            raise RuntimeError("SharePoint OAuth not initialized - missing credentials")
        return self.oauth.create_authorization_url(self.redirect_uri)

    async def handle_oauth_callback(self, auth_code: str) -> Dict[str, Any]:
        """Handle OAuth callback"""
        if not self.oauth:
            raise RuntimeError("SharePoint OAuth not initialized - missing credentials")
        try:
            success = await self.oauth.handle_authorization_callback(auth_code, self.redirect_uri)
            if success:
                self._authenticated = True
                return {"status": "success"}
            raise ValueError("OAuth callback failed")
        except Exception as e:
            logger.error(f"OAuth callback failed: {e}")
            raise
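
    # OAuth flow sketch (illustrative; the authorization code arrives as the
    # "code" query parameter on the configured redirect URI):
    #
    #   url = connector.get_auth_url()              # send the user here
    #   # ...user consents, Microsoft redirects back with ?code=<auth_code>...
    #   await connector.handle_oauth_callback(auth_code)
    #   assert await connector.authenticate()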

    def sync_once(self) -> None:
        """
        Perform a one-shot sync of SharePoint files and emit documents.

        Mirrors the Google Drive connector's sync_once functionality.
        """
        async def _async_sync():
            try:
                file_list = await self.list_files(max_files=1000)  # adjust as needed
                for file_info in file_list.get("files", []):
                    file_id = file_info.get("id")
                    if not file_id:
                        continue
                    try:
                        # Get the full document content and emit it
                        doc = await self.get_file_content(file_id)
                        self.emit(doc)
                    except Exception as e:
                        logger.error(f"Failed to sync SharePoint file {file_info.get('name', 'unknown')}: {e}")
                        continue
            except Exception as e:
                logger.error(f"SharePoint sync_once failed: {e}")
                raise

        # Run the async sync to completion
        asyncio.run(_async_sync())

    async def setup_subscription(self) -> str:
        """Set up real-time subscription for file changes - BaseConnector interface"""
        webhook_url = self.config.get("webhook_url")
        if not webhook_url:
            logger.warning("No webhook URL configured, skipping SharePoint subscription setup")
            return "no-webhook-configured"
        try:
            # Ensure we're authenticated
            if not await self.authenticate():
                raise RuntimeError("SharePoint authentication failed during subscription setup")
            token = self.oauth.get_access_token()

            # Subscribe to the SharePoint site drive, or fall back to the
            # signed-in user's OneDrive if no site URL was configured
            site_info = self._parse_sharepoint_url()
            if site_info:
                resource = f"sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/root"
            else:
                resource = "me/drive/root"

            subscription_data = {
                "changeType": "created,updated,deleted",
                "notificationUrl": f"{webhook_url}/webhook/sharepoint",
                "resource": resource,
                "expirationDateTime": self._get_subscription_expiry(),
                "clientState": f"sharepoint_{self.tenant_id}",
            }
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }
            url = f"{self._graph_base_url}/subscriptions"
            async with httpx.AsyncClient() as client:
                response = await client.post(url, json=subscription_data, headers=headers, timeout=30)
                response.raise_for_status()
                result = response.json()

            subscription_id = result.get("id")
            if not subscription_id:
                raise ValueError("No subscription ID returned from Microsoft Graph")
            self._subscription_id = subscription_id
            logger.info(f"SharePoint subscription created: {subscription_id}")
            return subscription_id
        except Exception as e:
            logger.error(f"Failed to setup SharePoint subscription: {e}")
            raise

    def _get_subscription_expiry(self) -> str:
        """Get the subscription expiry time (a conservative 3 days; Graph's maximum varies by resource type)"""
        expiry = datetime.now(timezone.utc) + timedelta(days=3)
        return expiry.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    def _parse_sharepoint_url(self) -> Optional[Dict[str, str]]:
        """Parse the SharePoint URL to extract site information for Graph API calls"""
        if not self.sharepoint_url:
            return None
        try:
            # Extract hostname and site name from a URL like
            # https://contoso.sharepoint.com/sites/teamsite
            parsed = urlparse(self.sharepoint_url)
            host_name = parsed.netloc
            path_parts = parsed.path.strip("/").split("/")
            if len(path_parts) >= 2 and path_parts[0] == "sites":
                return {
                    "host_name": host_name,
                    "site_name": path_parts[1],
                }
        except Exception as e:
            logger.warning(f"Could not parse SharePoint URL {self.sharepoint_url}: {e}")
        return None
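
    # For example, "https://contoso.sharepoint.com/sites/teamsite" parses to
    # {"host_name": "contoso.sharepoint.com", "site_name": "teamsite"}; URLs
    # without a /sites/<name> path yield None, triggering the user-drive fallback.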

    async def list_files(
        self,
        page_token: Optional[str] = None,
        max_files: Optional[int] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """List all files using the Microsoft Graph API - BaseConnector interface"""
        try:
            # Ensure authentication
            if not await self.authenticate():
                raise RuntimeError("SharePoint authentication failed during file listing")

            # If file_ids or folder_ids are specified in config, use selective sync
            if self.cfg.file_ids or self.cfg.folder_ids:
                return await self._list_selected_files()

            files = []
            max_files_value = max_files if max_files is not None else 100

            # Build the Graph API URL for the site, falling back to the user's OneDrive
            site_info = self._parse_sharepoint_url()
            if site_info:
                base_url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/root/children"
            else:
                base_url = f"{self._graph_base_url}/me/drive/root/children"

            params = dict(self._default_params)
            params["$top"] = str(max_files_value)
            if page_token:
                params["$skiptoken"] = page_token

            response = await self._make_graph_request(base_url, params=params)
            data = response.json()

            for item in data.get("value", []):
                # Only include files, not folders
                if item.get("file"):
                    files.append({
                        "id": item.get("id", ""),
                        "name": item.get("name", ""),
                        "path": f"/drive/items/{item.get('id')}",
                        "size": int(item.get("size", 0)),
                        "modified": item.get("lastModifiedDateTime"),
                        "created": item.get("createdDateTime"),
                        "mime_type": item.get("file", {}).get("mimeType", self._get_mime_type(item.get("name", ""))),
                        "url": item.get("webUrl", ""),
                        "download_url": item.get("@microsoft.graph.downloadUrl"),
                    })

            # Check for a next page: Graph returns an @odata.nextLink whose
            # $skiptoken we surface as the page token
            next_page_token = None
            next_link = data.get("@odata.nextLink")
            if next_link:
                query_params = parse_qs(urlparse(next_link).query)
                if "$skiptoken" in query_params:
                    next_page_token = query_params["$skiptoken"][0]

            return {"files": files, "next_page_token": next_page_token}
        except Exception as e:
            logger.error(f"Failed to list SharePoint files: {e}")
            return {"files": [], "next_page_token": None}  # Return an empty result instead of raising

    async def get_file_content(self, file_id: str) -> ConnectorDocument:
        """Get file content and metadata - BaseConnector interface"""
        try:
            # Ensure authentication
            if not await self.authenticate():
                raise RuntimeError("SharePoint authentication failed during file content retrieval")

            # First get file metadata using the Graph API
            file_metadata = await self._get_file_metadata_by_id(file_id)
            if not file_metadata:
                raise ValueError(f"File not found: {file_id}")

            # Download file content, preferring the pre-authenticated download URL
            download_url = file_metadata.get("download_url")
            if download_url:
                content = await self._download_file_from_url(download_url)
            else:
                content = await self._download_file_content(file_id)

            # Create an ACL from metadata (detailed permissions would require
            # additional Graph API calls)
            acl = DocumentACL(
                owner="",
                user_permissions={},
                group_permissions={},
            )

            # Parse dates
            modified_time = self._parse_graph_date(file_metadata.get("modified"))
            created_time = self._parse_graph_date(file_metadata.get("created"))

            return ConnectorDocument(
                id=file_id,
                filename=file_metadata.get("name", ""),
                mimetype=file_metadata.get("mime_type", "application/octet-stream"),
                content=content,
                source_url=file_metadata.get("url", ""),
                acl=acl,
                modified_time=modified_time,
                created_time=created_time,
                metadata={
                    "sharepoint_path": file_metadata.get("path", ""),
                    "sharepoint_url": self.sharepoint_url,
                    "size": file_metadata.get("size", 0),
                },
            )
        except Exception as e:
            logger.error(f"Failed to get SharePoint file content {file_id}: {e}")
            raise
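
    # Usage sketch (illustrative; the item ID is hypothetical):
    #
    #   doc = await connector.get_file_content("01BYE5RZ6QN3ZWBTUFOFD3GSPGOHDJD36K")
    #   print(doc.filename, doc.mimetype, len(doc.content))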

    async def _get_file_metadata_by_id(self, file_id: str) -> Optional[Dict[str, Any]]:
        """Get file metadata by ID using the Graph API"""
        try:
            # Try the site-specific path first, falling back to the user drive
            site_info = self._parse_sharepoint_url()
            if site_info:
                url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/items/{file_id}"
            else:
                url = f"{self._graph_base_url}/me/drive/items/{file_id}"

            params = dict(self._default_params)
            response = await self._make_graph_request(url, params=params)
            item = response.json()

            if item.get("file"):
                return {
                    "id": file_id,
                    "name": item.get("name", ""),
                    "path": f"/drive/items/{file_id}",
                    "size": int(item.get("size", 0)),
                    "modified": item.get("lastModifiedDateTime"),
                    "created": item.get("createdDateTime"),
                    "mime_type": item.get("file", {}).get("mimeType", self._get_mime_type(item.get("name", ""))),
                    "url": item.get("webUrl", ""),
                    "download_url": item.get("@microsoft.graph.downloadUrl"),
                }
            # Check if it's a folder
            if item.get("folder"):
                return {
                    "id": file_id,
                    "name": item.get("name", ""),
                    "isFolder": True,
                }
            return None
        except Exception as e:
            logger.error(f"Failed to get file metadata for {file_id}: {e}")
            return None

    async def _download_file_content(self, file_id: str) -> bytes:
        """Download file content by file ID using the Graph API"""
        try:
            site_info = self._parse_sharepoint_url()
            if site_info:
                url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/items/{file_id}/content"
            else:
                url = f"{self._graph_base_url}/me/drive/items/{file_id}/content"

            token = self.oauth.get_access_token()
            headers = {"Authorization": f"Bearer {token}"}
            # The /content endpoint answers with a 302 redirect to a
            # pre-authenticated download URL, so redirects must be followed
            async with httpx.AsyncClient(follow_redirects=True) as client:
                response = await client.get(url, headers=headers, timeout=60)
                response.raise_for_status()
                return response.content
        except Exception as e:
            logger.error(f"Failed to download file content for {file_id}: {e}")
            raise

    async def _list_selected_files(self) -> Dict[str, Any]:
        """List only selected files/folders (selective sync)."""
        files: List[Dict[str, Any]] = []

        # Process selected file IDs
        if self.cfg.file_ids:
            for file_id in self.cfg.file_ids:
                try:
                    file_meta = await self._get_file_metadata_by_id(file_id)
                    if not file_meta:
                        continue
                    if file_meta.get("isFolder", False):
                        # If it's actually a folder, expand its contents
                        files.extend(await self._list_folder_contents(file_id))
                    else:
                        files.append(file_meta)
                except Exception as e:
                    logger.warning(f"Failed to get file {file_id}: {e}")
                    continue

        # Process selected folder IDs
        if self.cfg.folder_ids:
            for folder_id in self.cfg.folder_ids:
                try:
                    files.extend(await self._list_folder_contents(folder_id))
                except Exception as e:
                    logger.warning(f"Failed to list folder {folder_id}: {e}")
                    continue

        return {"files": files, "next_page_token": None}

    async def _list_folder_contents(self, folder_id: str) -> List[Dict[str, Any]]:
        """List all files in a folder recursively."""
        files: List[Dict[str, Any]] = []
        try:
            site_info = self._parse_sharepoint_url()
            if site_info:
                url = f"{self._graph_base_url}/sites/{site_info['host_name']}:/sites/{site_info['site_name']}:/drive/items/{folder_id}/children"
            else:
                url = f"{self._graph_base_url}/me/drive/items/{folder_id}/children"

            params = dict(self._default_params)
            response = await self._make_graph_request(url, params=params)
            items = response.json().get("value", [])

            for item in items:
                if item.get("file"):  # It's a file
                    file_meta = await self._get_file_metadata_by_id(item.get("id"))
                    if file_meta:
                        files.append(file_meta)
                elif item.get("folder"):  # It's a subfolder, recurse
                    files.extend(await self._list_folder_contents(item.get("id")))
        except Exception as e:
            logger.error(f"Failed to list folder contents for {folder_id}: {e}")
        return files

    async def _download_file_from_url(self, download_url: str) -> bytes:
        """Download file content from a direct (pre-authenticated) download URL"""
        try:
            # Follow redirects, since download URLs may bounce through storage hosts
            async with httpx.AsyncClient(follow_redirects=True) as client:
                response = await client.get(download_url, timeout=60)
                response.raise_for_status()
                return response.content
        except Exception as e:
            logger.error(f"Failed to download from URL {download_url}: {e}")
            raise

    def _parse_graph_date(self, date_str: Optional[str]) -> datetime:
        """Parse a Microsoft Graph ISO 8601 date string to a naive datetime"""
        if not date_str:
            return datetime.now()
        try:
            # Graph dates are UTC and usually end in "Z", which
            # datetime.fromisoformat() cannot parse on older Pythons
            if date_str.endswith("Z"):
                return datetime.fromisoformat(date_str[:-1])
            return datetime.fromisoformat(date_str)
        except (ValueError, AttributeError):
            return datetime.now()
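
    # e.g. _parse_graph_date("2024-01-15T10:30:00Z") -> datetime(2024, 1, 15, 10, 30)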

    async def _make_graph_request(self, url: str, method: str = "GET",
                                  data: Optional[Dict] = None, params: Optional[Dict] = None) -> httpx.Response:
        """Make an authenticated API request to Microsoft Graph"""
        token = self.oauth.get_access_token()
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }
        async with httpx.AsyncClient() as client:
            if method.upper() == "GET":
                response = await client.get(url, headers=headers, params=params, timeout=30)
            elif method.upper() == "POST":
                response = await client.post(url, headers=headers, json=data, timeout=30)
            elif method.upper() == "DELETE":
                response = await client.delete(url, headers=headers, timeout=30)
            else:
                raise ValueError(f"Unsupported HTTP method: {method}")
            response.raise_for_status()
            return response

    def _get_mime_type(self, filename: str) -> str:
        """Get the MIME type based on the file extension"""
        mime_type, _ = mimetypes.guess_type(filename)
        return mime_type or "application/octet-stream"

    # Webhook methods - BaseConnector interface
    def handle_webhook_validation(self, request_method: str, headers: Dict[str, str],
                                  query_params: Dict[str, str]) -> Optional[str]:
        """Handle webhook validation (Graph sends a validationToken that must be echoed back)"""
        if request_method == "POST" and "validationToken" in query_params:
            return query_params["validationToken"]
        return None

    def extract_webhook_channel_id(self, payload: Dict[str, Any],
                                   headers: Dict[str, str]) -> Optional[str]:
        """Extract the channel/subscription ID from a webhook payload"""
        notifications = payload.get("value", [])
        if notifications:
            return notifications[0].get("subscriptionId")
        return None

    async def handle_webhook(self, payload: Dict[str, Any]) -> List[str]:
        """Handle a webhook notification and return the affected file IDs"""
        affected_files = []
        # Microsoft Graph batches change notifications under "value"
        for notification in payload.get("value", []):
            resource = notification.get("resource")
            if resource and "/drive/items/" in resource:
                file_id = resource.split("/drive/items/")[-1]
                affected_files.append(file_id)
        return affected_files
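
    # A minimal Graph notification payload (illustrative; the item ID is
    # hypothetical) looks like:
    #
    #   {"value": [{"subscriptionId": "...",
    #               "clientState": "sharepoint_common",
    #               "resource": "me/drive/items/01BYE5RZ6QN3ZWBTUFOFD3GSPGOHDJD36K"}]}
    #
    # for which handle_webhook returns ["01BYE5RZ6QN3ZWBTUFOFD3GSPGOHDJD36K"].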

    async def cleanup_subscription(self, subscription_id: str) -> bool:
        """Clean up a subscription - BaseConnector interface"""
        if subscription_id == "no-webhook-configured":
            logger.info("No subscription to clean up (webhook was not configured)")
            return True
        try:
            # Ensure authentication
            if not await self.authenticate():
                logger.error("SharePoint authentication failed during subscription cleanup")
                return False

            token = self.oauth.get_access_token()
            headers = {"Authorization": f"Bearer {token}"}
            url = f"{self._graph_base_url}/subscriptions/{subscription_id}"
            async with httpx.AsyncClient() as client:
                response = await client.delete(url, headers=headers, timeout=30)

            # Treat 404 as success: the subscription is already gone
            if response.status_code in (200, 204, 404):
                logger.info(f"SharePoint subscription {subscription_id} cleaned up successfully")
                return True
            logger.warning(f"Unexpected response cleaning up subscription: {response.status_code}")
            return False
        except Exception as e:
            logger.error(f"Failed to cleanup SharePoint subscription {subscription_id}: {e}")
            return False
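
# End-to-end usage sketch (illustrative; assumes the MSAL client ID/secret env
# vars are set and a prior OAuth flow has persisted a token file):
#
#   import asyncio
#
#   connector = SharePointConnector({
#       "sharepoint_url": "https://contoso.sharepoint.com/sites/teamsite",
#       "connection_id": "demo",
#   })
#   if asyncio.run(connector.authenticate()):
#       listing = asyncio.run(connector.list_files(max_files=25))
#       for f in listing["files"]:
#           print(f["name"], f["size"])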