This commit is contained in:
yongtenglei 2025-11-24 17:25:41 +08:00
parent 1009819801
commit 0b666e914a
2 changed files with 126 additions and 35 deletions

View file

@ -50,6 +50,7 @@ class DocumentSource(str, Enum):
DISCORD = "discord" DISCORD = "discord"
MOODLE = "moodle" MOODLE = "moodle"
S3_COMPATIBLE = "s3_compatible" S3_COMPATIBLE = "s3_compatible"
DROPBOX = "dropbox"
class FileOrigin(str, Enum): class FileOrigin(str, Enum):

View file

@ -1,13 +1,24 @@
"""Dropbox connector""" """Dropbox connector"""
import logging
from datetime import timezone
from typing import Any from typing import Any
from dropbox import Dropbox from dropbox import Dropbox
from dropbox.exceptions import ApiError, AuthError from dropbox.exceptions import ApiError, AuthError
from dropbox.files import FileMetadata, FolderMetadata
from common.data_source.config import INDEX_BATCH_SIZE from common.data_source.config import INDEX_BATCH_SIZE, DocumentSource
from common.data_source.exceptions import ConnectorValidationError, InsufficientPermissionsError, ConnectorMissingCredentialError from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
InsufficientPermissionsError,
)
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
from common.data_source.models import Document, GenerateDocumentsOutput
from common.data_source.utils import get_file_ext
logger = logging.getLogger(__name__)
class DropboxConnector(LoadConnector, PollConnector): class DropboxConnector(LoadConnector, PollConnector):
@ -19,29 +30,29 @@ class DropboxConnector(LoadConnector, PollConnector):
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
    """Create the Dropbox client from the supplied credentials.

    Args:
        credentials: Mapping that is read for the "dropbox_access_token" key.

    Returns:
        Always None.

    Raises:
        ConnectorMissingCredentialError: If the token is absent or empty.
    """
    token = credentials.get("dropbox_access_token")
    if token:
        self.dropbox_client = Dropbox(token)
        return None

    raise ConnectorMissingCredentialError("Dropbox access token is required")
def validate_connector_settings(self) -> None:
    """Check that the loaded Dropbox client can list the root folder.

    A single-entry listing of path "" is used as the probe call. Every
    failure is re-raised as a connector exception with the original error
    attached as its cause, so the underlying Dropbox traceback is preserved.

    Raises:
        ConnectorMissingCredentialError: If no Dropbox client has been loaded.
        InsufficientPermissionsError: If the API error payload mentions
            "insufficient_permissions".
        ConnectorValidationError: For an authentication failure, any other
            API error, or any other exception raised by the probe call.
    """
    if self.dropbox_client is None:
        raise ConnectorMissingCredentialError("Dropbox")

    try:
        self.dropbox_client.files_list_folder(path="", limit=1)
    except AuthError as e:
        logger.exception("[Dropbox]: Failed to validate Dropbox credentials")
        raise ConnectorValidationError(f"Dropbox credential is invalid: {e}") from e
    except ApiError as e:
        # The structured error payload is matched by its text form.
        if e.error is not None and "insufficient_permissions" in str(e.error).lower():
            raise InsufficientPermissionsError("Your Dropbox token does not have sufficient permissions.") from e
        raise ConnectorValidationError(f"Unexpected Dropbox error during validation: {e.user_message_text or e}") from e
    except Exception as e:
        raise ConnectorValidationError(f"Unexpected error during Dropbox settings validation: {e}") from e
def _download_file(self, path: str) -> bytes: def _download_file(self, path: str) -> bytes:
"""Download a single file from Dropbox.""" """Download a single file from Dropbox."""
@ -54,26 +65,105 @@ class DropboxConnector(LoadConnector, PollConnector):
"""Create a shared link for a file in Dropbox.""" """Create a shared link for a file in Dropbox."""
if self.dropbox_client is None: if self.dropbox_client is None:
raise ConnectorMissingCredentialError("Dropbox") raise ConnectorMissingCredentialError("Dropbox")
try: try:
# Try to get existing shared links first
shared_links = self.dropbox_client.sharing_list_shared_links(path=path) shared_links = self.dropbox_client.sharing_list_shared_links(path=path)
if shared_links.links: if shared_links.links:
return shared_links.links[0].url return shared_links.links[0].url
# Create a new shared link
link_settings = self.dropbox_client.sharing_create_shared_link_with_settings(path)
return link_settings.url
except Exception:
# Fallback to basic link format
return f"https://www.dropbox.com/home{path}"
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any: link_metadata = self.dropbox_client.sharing_create_shared_link_with_settings(path)
return link_metadata.url
except ApiError as err:
logger.exception(f"[Dropbox]: Failed to create a shared link for {path}: {err}")
return ""
def _yield_files_recursive(
    self,
    path: str,
    start: SecondsSinceUnixEpoch | None,
    end: SecondsSinceUnixEpoch | None,
) -> GenerateDocumentsOutput:
    """Yield batches of documents from a Dropbox folder and its subfolders.

    The folder is listed page by page (page size ``self.batch_size``). Files
    on a page are downloaded and collected into one batch; each subfolder met
    on a page is walked immediately, so its batches are yielded before the
    batch of the page that contains it.

    The time window is matched against ``server_modified``. Dropbox documents
    ``client_modified`` as an unverified, client-supplied value that must not
    be used to decide whether a file has changed, so it is kept only as the
    document's ``doc_updated_at``.

    Args:
        path: Dropbox folder path to list; callers in this class pass "" to
            start from the root.
        start: Inclusive lower bound in seconds since the Unix epoch, or None
            for no lower bound.
        end: Inclusive upper bound in seconds since the Unix epoch, or None
            for no upper bound.

    Yields:
        Non-empty lists of Document.

    Raises:
        ConnectorMissingCredentialError: If no Dropbox client has been loaded.
            Because this is a generator, it is raised on first iteration.
    """
    if self.dropbox_client is None:
        raise ConnectorMissingCredentialError("Dropbox")

    def _as_utc(moment):
        """Return *moment* as an aware UTC datetime; naive values are taken as UTC."""
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    result = self.dropbox_client.files_list_folder(
        path,
        limit=self.batch_size,
        recursive=False,
        include_non_downloadable_files=False,
    )

    while True:
        batch: list[Document] = []

        for entry in result.entries:
            if isinstance(entry, FolderMetadata):
                yield from self._yield_files_recursive(entry.path_lower, start, end)
                continue
            if not isinstance(entry, FileMetadata):
                continue

            # Window check uses the server-side timestamp (see docstring).
            changed_at = _as_utc(entry.server_modified).timestamp()
            if start is not None and changed_at < start:
                continue
            if end is not None and changed_at > end:
                continue

            try:
                downloaded_file = self._download_file(entry.path_display)
            except Exception:
                # Best effort: one failed download must not abort the whole walk.
                logger.exception("[Dropbox]: Error downloading file %s", entry.path_display)
                continue

            batch.append(
                Document(
                    id=f"dropbox:{entry.id}",
                    blob=downloaded_file,
                    source=DocumentSource.DROPBOX,
                    semantic_identifier=entry.name,
                    extension=get_file_ext(entry.name),
                    doc_updated_at=_as_utc(entry.client_modified),
                    size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file),
                )
            )

        if batch:
            yield batch

        if not result.has_more:
            break

        result = self.dropbox_client.files_list_folder_continue(result.cursor)
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
    """Yield document batches for files inside the given time window.

    Walks the Dropbox root ("") through ``_yield_files_recursive`` and passes
    every batch through unchanged.

    Args:
        start: Lower bound of the window, in seconds since the Unix epoch.
        end: Upper bound of the window, in seconds since the Unix epoch.

    Raises:
        ConnectorMissingCredentialError: If no Dropbox client has been loaded.
            Because this is a generator, it is raised on first iteration.
    """
    if self.dropbox_client is None:
        raise ConnectorMissingCredentialError("Dropbox")

    yield from self._yield_files_recursive("", start, end)
def load_from_state(self) -> GenerateDocumentsOutput:
    """Return a generator over every file under the Dropbox root.

    Delegates to ``_yield_files_recursive`` with path "" and no time bounds.
    """
    all_batches = self._yield_files_recursive("", None, None)
    return all_batches
if __name__ == "__main__":
    # Manual smoke test: reads the token from DROPBOX_ACCESS_TOKEN, validates
    # the connector, then prints a summary of the first batch it produces.
    import os

    logging.basicConfig(level=logging.DEBUG)

    connector = DropboxConnector()
    connector.load_credentials({"dropbox_access_token": os.environ.get("DROPBOX_ACCESS_TOKEN")})
    connector.validate_connector_settings()

    # Batches are never empty, so None can only mean "nothing was yielded".
    first_batch = next(connector.load_from_state(), None)
    if first_batch is None:
        print("No documents available in Dropbox.")
    else:
        print(f"Loaded {len(first_batch)} documents in first batch.")
        for doc in first_batch:
            print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)")