openrag/src/connectors/sharepoint/connector.py
2025-09-19 12:58:52 -07:00

241 lines
8.9 KiB
Python

from pathlib import Path
import httpx
import uuid
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from ..base import BaseConnector, ConnectorDocument, DocumentACL
from .oauth import SharePointOAuth
class SharePointConnector(BaseConnector):
    """SharePoint Sites connector using Microsoft Graph API.

    Lists and downloads files from the ``drive`` of a single SharePoint site
    (``/sites/{site_id}/drive``) and manages a Graph change-notification
    subscription on that drive's root.
    """

    CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"

    # Connector metadata
    CONNECTOR_NAME = "SharePoint"
    CONNECTOR_DESCRIPTION = "Connect to SharePoint sites to sync team documents"
    CONNECTOR_ICON = "sharepoint"

    # Status codes treated as a redirect when downloading file content.
    _REDIRECT_STATUSES = (301, 302, 303, 307, 308)

    def __init__(self, config: Dict[str, Any]):
        """Build the connector from its configuration mapping.

        Config keys read here: ``token_file``, ``subscription_id`` (falling
        back to ``webhook_channel_id``) and ``site_id``. ``webhook_url`` is
        read later by :meth:`setup_subscription`.
        """
        super().__init__(config)

        # Default token location: four directory levels above this file.
        project_root = Path(__file__).resolve().parent.parent.parent.parent
        token_file = config.get("token_file") or str(
            project_root / "onedrive_token.json"
        )
        self.oauth = SharePointOAuth(
            client_id=self.get_client_id(),
            client_secret=self.get_client_secret(),
            token_file=token_file,
        )
        self.subscription_id = config.get("subscription_id") or config.get(
            "webhook_channel_id"
        )
        self.base_url = "https://graph.microsoft.com/v1.0"

        # SharePoint site configuration
        # NOTE(review): site_id is interpolated into every request URL but is
        # never validated; a missing value yields ".../sites/None/...".
        self.site_id = config.get("site_id")  # Required for SharePoint

    def _auth_headers(self) -> Dict[str, str]:
        """Return the bearer ``Authorization`` header for the current token."""
        return {"Authorization": f"Bearer {self.oauth.get_access_token()}"}

    async def authenticate(self) -> bool:
        """Mark the connector authenticated if the OAuth helper reports so.

        Returns:
            True when ``self.oauth.is_authenticated()`` is truthy, else False.
        """
        if await self.oauth.is_authenticated():
            self._authenticated = True
            return True
        return False

    async def setup_subscription(self) -> str:
        """Create a Graph subscription on the site's drive root.

        Returns:
            The subscription id returned by Graph (also stored on
            ``self.subscription_id``).

        Raises:
            ValueError: if not authenticated or ``webhook_url`` is not
                configured.
            httpx.HTTPStatusError: if Graph rejects the request.
        """
        if not self._authenticated:
            raise ValueError("Not authenticated")

        webhook_url = self.config.get("webhook_url")
        if not webhook_url:
            raise ValueError("webhook_url required in config for subscriptions")

        # Local import: the file-level import only brings in datetime/timedelta.
        from datetime import timezone

        # Timezone-aware "now" (datetime.utcnow() is deprecated), rendered with
        # the "Z" suffix instead of "+00:00".
        expires_at = datetime.now(timezone.utc) + timedelta(days=2)
        expiration = expires_at.isoformat().replace("+00:00", "Z")

        body = {
            # Graph documents that drive root subscriptions support only the
            # "updated" change type.
            "changeType": "updated",
            "notificationUrl": webhook_url,
            "resource": f"/sites/{self.site_id}/drive/root",
            "expirationDateTime": expiration,
            # NOTE(review): this random clientState is sent but not stored, so
            # incoming notifications cannot be checked against it.
            "clientState": str(uuid.uuid4()),
        }

        async with httpx.AsyncClient() as client:
            resp = await client.post(
                f"{self.base_url}/subscriptions",
                json=body,
                headers=self._auth_headers(),
            )
            resp.raise_for_status()
            data = resp.json()

        self.subscription_id = data["id"]
        return self.subscription_id

    async def list_files(
        self, page_token: Optional[str] = None, limit: int = 100
    ) -> Dict[str, Any]:
        """List the files directly under the drive root (one page).

        Args:
            page_token: ``$skiptoken`` value from a previous call, if any.
            limit: page size sent as ``$top``.

        Returns:
            ``{"files": [...], "nextPageToken": token_or_None}``. Only items
            carrying a ``file`` facet are included; folders are skipped.

        Raises:
            ValueError: if not authenticated.
            httpx.HTTPStatusError: on a non-success Graph response.
        """
        if not self._authenticated:
            raise ValueError("Not authenticated")

        params = {"$top": str(limit)}
        if page_token:
            params["$skiptoken"] = page_token

        async with httpx.AsyncClient() as client:
            resp = await client.get(
                f"{self.base_url}/sites/{self.site_id}/drive/root/children",
                params=params,
                headers=self._auth_headers(),
            )
            resp.raise_for_status()
            data = resp.json()

        files = [
            {
                "id": item["id"],
                "name": item["name"],
                "mimeType": item["file"].get(
                    "mimeType", "application/octet-stream"
                ),
                "webViewLink": item.get("webUrl"),
                "createdTime": item.get("createdDateTime"),
                "modifiedTime": item.get("lastModifiedDateTime"),
            }
            for item in data.get("value", [])
            if item.get("file")
        ]

        # The continuation token is carried in the nextLink's query string.
        next_token = None
        next_link = data.get("@odata.nextLink")
        if next_link:
            from urllib.parse import urlparse, parse_qs

            query = parse_qs(urlparse(next_link).query)
            next_token = query.get("$skiptoken", [None])[0]

        return {"files": files, "nextPageToken": next_token}

    async def get_file_content(self, file_id: str) -> ConnectorDocument:
        """Fetch a drive item's metadata, bytes and permissions.

        Args:
            file_id: id of the drive item to download.

        Returns:
            A ``ConnectorDocument`` with naive (tzinfo-stripped) created and
            modified timestamps.

        Raises:
            ValueError: if not authenticated.
            httpx.HTTPStatusError: if any request ends in a non-success
                status, including a redirect that carries no ``Location``.
        """
        if not self._authenticated:
            raise ValueError("Not authenticated")

        headers = self._auth_headers()
        item_url = f"{self.base_url}/sites/{self.site_id}/drive/items/{file_id}"

        async with httpx.AsyncClient() as client:
            meta_resp = await client.get(item_url, headers=headers)
            meta_resp.raise_for_status()
            metadata = meta_resp.json()

            content_resp = await client.get(f"{item_url}/content", headers=headers)
            # The client does not follow redirects itself here, so follow one
            # hop manually. The redirect target is requested without the
            # Authorization header (presumably a pre-authenticated download
            # URL - confirm).
            redirect_url = content_resp.headers.get("Location")
            if content_resp.status_code in self._REDIRECT_STATUSES and redirect_url:
                content_resp = await client.get(redirect_url)
            # Always validate the final response: previously a redirect
            # without a Location header slipped through and its (empty) body
            # was returned as the file content.
            content_resp.raise_for_status()
            content = content_resp.content

            perm_resp = await client.get(f"{item_url}/permissions", headers=headers)
            perm_resp.raise_for_status()
            permissions = perm_resp.json()

        acl = self._parse_permissions(metadata, permissions)

        # fromisoformat() on older Pythons rejects a trailing "Z"; normalise it
        # to an explicit offset, then drop tzinfo to get naive datetimes.
        modified = datetime.fromisoformat(
            metadata["lastModifiedDateTime"].replace("Z", "+00:00")
        ).replace(tzinfo=None)
        created = datetime.fromisoformat(
            metadata["createdDateTime"].replace("Z", "+00:00")
        ).replace(tzinfo=None)

        return ConnectorDocument(
            id=metadata["id"],
            filename=metadata["name"],
            mimetype=metadata.get("file", {}).get(
                "mimeType", "application/octet-stream"
            ),
            content=content,
            source_url=metadata.get("webUrl"),
            acl=acl,
            modified_time=modified,
            created_time=created,
            metadata={"size": metadata.get("size")},
        )

    def _parse_permissions(
        self, metadata: Dict[str, Any], permissions: Dict[str, Any]
    ) -> DocumentACL:
        """Translate Graph item metadata and permissions into a DocumentACL.

        The owner is the ``createdBy.user.email`` of the item, when present.
        Each permission contributes its first role (default ``"read"``) for
        the user and/or group e-mail found under ``grantedToV2`` or
        ``grantedTo``; grantees without an e-mail are ignored.
        """
        acl = DocumentACL()

        # "or {}" guards against explicit nulls as well as missing keys.
        creator = (metadata.get("createdBy") or {}).get("user") or {}
        owner = creator.get("email")
        if owner:
            acl.owner = owner

        for perm in permissions.get("value") or []:
            # An empty or null "roles" list falls back to "read" instead of
            # raising on the [0] lookup.
            role = (perm.get("roles") or ["read"])[0]

            grantee = perm.get("grantedToV2") or perm.get("grantedTo")
            if not grantee:
                continue

            user = grantee.get("user")
            if user and user.get("email"):
                acl.user_permissions[user["email"]] = role

            group = grantee.get("group")
            if group and group.get("email"):
                acl.group_permissions[group["email"]] = role

        return acl

    def handle_webhook_validation(
        self, request_method: str, headers: Dict[str, str], query_params: Dict[str, str]
    ) -> Optional[str]:
        """Handle Microsoft Graph webhook validation.

        Graph validates a notification URL by sending a request carrying a
        ``validationToken`` query parameter and expecting it echoed back.
        That request is a POST, so POST is accepted alongside GET.

        Returns:
            The validation token to echo, or None if this request is not a
            validation handshake.
        """
        if (request_method or "").upper() not in ("GET", "POST"):
            return None
        token = query_params.get("validationtoken") or query_params.get(
            "validationToken"
        )
        return token or None

    def extract_webhook_channel_id(
        self, payload: Dict[str, Any], headers: Dict[str, str]
    ) -> Optional[str]:
        """Extract SharePoint subscription ID from webhook payload.

        Returns the ``subscriptionId`` of the first notification in
        ``payload["value"]``, or None when there are no notifications.
        """
        values = payload.get("value") or []
        return values[0].get("subscriptionId") if values else None

    async def handle_webhook(self, payload: Dict[str, Any]) -> List[str]:
        """Collect the ``resourceData.id`` of every notification in *payload*.

        Notifications without ``resourceData`` or without an id are skipped.
        """
        file_ids = []
        for item in payload.get("value") or []:
            # "or {}" also covers an explicit null resourceData.
            file_id = (item.get("resourceData") or {}).get("id")
            if file_id:
                file_ids.append(file_id)
        return file_ids

    async def cleanup_subscription(
        self, subscription_id: str, resource_id: Optional[str] = None
    ) -> bool:
        """Delete a Graph subscription.

        Args:
            subscription_id: id of the subscription to delete.
            resource_id: accepted but not used by this connector.

        Returns:
            True if Graph answered 200 or 204; False otherwise, including
            when the connector is not authenticated.
        """
        if not self._authenticated:
            return False

        async with httpx.AsyncClient() as client:
            resp = await client.delete(
                f"{self.base_url}/subscriptions/{subscription_id}",
                headers=self._auth_headers(),
            )
        return resp.status_code in (200, 204)