diff --git a/common/data_source/config.py b/common/data_source/config.py index 0c038c6d7..751d1f33c 100644 --- a/common/data_source/config.py +++ b/common/data_source/config.py @@ -50,6 +50,7 @@ class DocumentSource(str, Enum): DISCORD = "discord" MOODLE = "moodle" S3_COMPATIBLE = "s3_compatible" + DROPBOX = "dropbox" class FileOrigin(str, Enum): diff --git a/common/data_source/dropbox_connector.py b/common/data_source/dropbox_connector.py index fd349baa1..97a4f0b60 100644 --- a/common/data_source/dropbox_connector.py +++ b/common/data_source/dropbox_connector.py @@ -1,13 +1,24 @@ """Dropbox connector""" +import logging +from datetime import timezone from typing import Any from dropbox import Dropbox from dropbox.exceptions import ApiError, AuthError +from dropbox.files import FileMetadata, FolderMetadata -from common.data_source.config import INDEX_BATCH_SIZE -from common.data_source.exceptions import ConnectorValidationError, InsufficientPermissionsError, ConnectorMissingCredentialError +from common.data_source.config import INDEX_BATCH_SIZE, DocumentSource +from common.data_source.exceptions import ( + ConnectorMissingCredentialError, + ConnectorValidationError, + InsufficientPermissionsError, +) from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch +from common.data_source.models import Document, GenerateDocumentsOutput +from common.data_source.utils import get_file_ext + +logger = logging.getLogger(__name__) class DropboxConnector(LoadConnector, PollConnector): @@ -19,29 +30,29 @@ class DropboxConnector(LoadConnector, PollConnector): def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: """Load Dropbox credentials""" - try: - access_token = credentials.get("dropbox_access_token") - if not access_token: - raise ConnectorMissingCredentialError("Dropbox access token is required") - - self.dropbox_client = Dropbox(access_token) - return None - except Exception as e: - raise ConnectorMissingCredentialError(f"Dropbox: {e}") + access_token = credentials.get("dropbox_access_token") + if not access_token: + raise ConnectorMissingCredentialError("Dropbox access token is required") + + self.dropbox_client = Dropbox(access_token) + return None def validate_connector_settings(self) -> None: """Validate Dropbox connector settings""" - if not self.dropbox_client: + if self.dropbox_client is None: raise ConnectorMissingCredentialError("Dropbox") - + try: - # Test connection by getting current account info - self.dropbox_client.users_get_current_account() - except (AuthError, ApiError) as e: - if "invalid_access_token" in str(e).lower(): - raise InsufficientPermissionsError("Invalid Dropbox access token") - else: - raise ConnectorValidationError(f"Dropbox validation error: {e}") + self.dropbox_client.files_list_folder(path="", limit=1) + except AuthError as e: + logger.exception("[Dropbox]: Failed to validate Dropbox credentials") + raise ConnectorValidationError(f"Dropbox credential is invalid: {e}") + except ApiError as e: + if e.error is not None and "insufficient_permissions" in str(e.error).lower(): + raise InsufficientPermissionsError("Your Dropbox token does not have sufficient permissions.") + raise ConnectorValidationError(f"Unexpected Dropbox error during validation: {e.user_message_text or e}") + except Exception as e: + raise ConnectorValidationError(f"Unexpected error during Dropbox settings validation: {e}") def _download_file(self, path: str) -> bytes: """Download a single file from Dropbox.""" @@ -54,26 +65,105 @@ class DropboxConnector(LoadConnector, PollConnector): """Create a shared link for a file in Dropbox.""" if self.dropbox_client is None: raise ConnectorMissingCredentialError("Dropbox") - + try: - # Try to get existing shared links first shared_links = self.dropbox_client.sharing_list_shared_links(path=path) if shared_links.links: return shared_links.links[0].url - - # Create a new shared link - link_settings = self.dropbox_client.sharing_create_shared_link_with_settings(path) - return link_settings.url - except Exception: - # Fallback to basic link format - return f"https://www.dropbox.com/home{path}" - def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any: + link_metadata = self.dropbox_client.sharing_create_shared_link_with_settings(path) + return link_metadata.url + except ApiError as err: + logger.exception(f"[Dropbox]: Failed to create a shared link for {path}: {err}") + return "" + + def _yield_files_recursive( + self, + path: str, + start: SecondsSinceUnixEpoch | None, + end: SecondsSinceUnixEpoch | None, + ) -> GenerateDocumentsOutput: + """Yield files in batches from a specified Dropbox folder, including subfolders.""" + if self.dropbox_client is None: + raise ConnectorMissingCredentialError("Dropbox") + + result = self.dropbox_client.files_list_folder( + path, + limit=self.batch_size, + recursive=False, + include_non_downloadable_files=False, + ) + + while True: + batch: list[Document] = [] + for entry in result.entries: + if isinstance(entry, FileMetadata): + modified_time = entry.client_modified + if modified_time.tzinfo is None: + modified_time = modified_time.replace(tzinfo=timezone.utc) + else: + modified_time = modified_time.astimezone(timezone.utc) + + time_as_seconds = modified_time.timestamp() + if start is not None and time_as_seconds < start: + continue + if end is not None and time_as_seconds > end: + continue + + try: + downloaded_file = self._download_file(entry.path_display) + except Exception: + logger.exception(f"[Dropbox]: Error downloading file {entry.path_display}") + continue + + batch.append( + Document( + id=f"dropbox:{entry.id}", + blob=downloaded_file, + source=DocumentSource.DROPBOX, + semantic_identifier=entry.name, + extension=get_file_ext(entry.name), + doc_updated_at=modified_time, + size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file), + ) + ) + + elif isinstance(entry, FolderMetadata): + yield from self._yield_files_recursive(entry.path_lower, start, end) + + if batch: + yield batch + + if not result.has_more: + break + + result = self.dropbox_client.files_list_folder_continue(result.cursor) + + def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput: """Poll Dropbox for recent file changes""" - # Simplified implementation - in production this would handle actual polling - return [] + if self.dropbox_client is None: + raise ConnectorMissingCredentialError("Dropbox") - def load_from_state(self) -> Any: + for batch in self._yield_files_recursive("", start, end): + yield batch + + def load_from_state(self) -> GenerateDocumentsOutput: """Load files from Dropbox state""" - # Simplified implementation - return [] \ No newline at end of file + return self._yield_files_recursive("", None, None) + + +if __name__ == "__main__": + import os + + logging.basicConfig(level=logging.DEBUG) + connector = DropboxConnector() + connector.load_credentials({"dropbox_access_token": os.environ.get("DROPBOX_ACCESS_TOKEN")}) + connector.validate_connector_settings() + document_batches = connector.load_from_state() + try: + first_batch = next(document_batches) + print(f"Loaded {len(first_batch)} documents in first batch.") + for doc in first_batch: + print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)") + except StopIteration: + print("No documents available in Dropbox.")