"""Box connector: loads and incrementally polls files from a Box folder tree."""
import os
import logging
from datetime import datetime, timezone
from typing import Any

from box_sdk_gen import BoxClient
from common.data_source.config import DocumentSource, INDEX_BATCH_SIZE
from common.data_source.exceptions import (
    ConnectorMissingCredentialError,
    ConnectorValidationError,
)
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
from common.data_source.models import Document, GenerateDocumentsOutput
from common.data_source.utils import get_file_ext


class BoxConnector(LoadConnector, PollConnector):
    """Walk a Box folder recursively and emit its files as ``Document`` batches.

    Args:
        folder_id: Box folder to start from; ``"0"`` is the default.
        batch_size: Page size requested from the folder listing call.
        use_marker: When true, page through folder listings with markers;
            otherwise page with offsets.
    """

    def __init__(self, folder_id: str = "0", batch_size: int = INDEX_BATCH_SIZE, use_marker: bool = False) -> None:
        self.batch_size = batch_size
        self.folder_id = folder_id
        self.use_marker = use_marker
        # Set by load_credentials(); initialised here so the "is None" guards
        # below raise ConnectorMissingCredentialError rather than AttributeError.
        self.box_client: BoxClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Build the Box client from ``credentials["auth"]``.

        NOTE(review): the value is handed straight to ``BoxClient(auth=...)``,
        so it presumably has to be a Box SDK authentication object rather than
        raw client id/secret strings -- confirm against the caller.

        Raises:
            ConnectorMissingCredentialError: if ``auth`` is missing or falsy.
        """
        auth = credentials.get("auth")
        if not auth:
            raise ConnectorMissingCredentialError("Box auth is required")

        self.box_client = BoxClient(auth=auth)
        return None

    def validate_connector_settings(self) -> None:
        """Check the credentials by fetching the current user.

        Raises:
            ConnectorMissingCredentialError: if no client has been created.
            ConnectorValidationError: if the call to Box fails for any reason.
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        try:
            self.box_client.users.get_user_me()
        except Exception as e:
            logging.exception("[Box]: Failed to validate Box credentials")
            raise ConnectorValidationError(f"Unexpected error during Box settings validation: {e}") from e

    def _iter_folder_pages(self, folder_id: str):
        """Yield the entries of ``folder_id`` one listing page at a time.

        Marker mode follows ``next_marker`` until it is empty. Offset mode
        advances ``offset`` by the number of entries received until
        ``total_count`` is reached or an empty page comes back.
        """
        request: dict[str, Any] = {
            "folder_id": folder_id,
            "limit": self.batch_size,
            "usemarker": self.use_marker,
        }
        fetched = 0
        while True:
            page = self.box_client.folders.get_folder_items(**request)
            entries = page.entries or []
            yield entries

            if self.use_marker:
                if not page.next_marker:
                    return
                request["marker"] = page.next_marker
            else:
                fetched += len(entries)
                total = getattr(page, "total_count", None)
                # Stop on an empty page as well, so a missing or wrong
                # total_count can never cause an endless loop.
                if not entries or total is None or fetched >= total:
                    return
                request["offset"] = fetched

    def _load_document(
        self,
        file_id: str,
        start: SecondsSinceUnixEpoch | None,
        end: SecondsSinceUnixEpoch | None,
    ) -> Document | None:
        """Fetch one file and return it as a ``Document``.

        Returns ``None`` when the file falls outside the ``(start, end]``
        window or when Box returns no content stream for it.
        """
        file = self.box_client.files.get_file_by_id(file_id)

        # Prefer the modification time so that edited files are picked up by
        # incremental polls; fall back to creation time if it is absent.
        raw_time = (
            getattr(file, "modified_at", None)
            or getattr(file, "content_modified_at", None)
            or getattr(file, "created_at", None)
            or getattr(file, "content_created_at", None)
        )

        if raw_time:
            modified_time = self._box_datetime_to_epoch_seconds(raw_time)
            if start is not None and modified_time <= start:
                return None
            if end is not None and modified_time > end:
                return None
        else:
            # No timestamp available: the window cannot be evaluated, so keep
            # the file and stamp it with the current time.
            modified_time = self._box_datetime_to_epoch_seconds(datetime.now(timezone.utc))

        content_stream = self.box_client.downloads.download_file(file.id)
        if content_stream is None:
            logging.warning("[Box]: No content returned for file %s; skipping", file.id)
            return None

        # NOTE(review): file.metadata is forwarded unchanged; confirm it has
        # the shape Document.metadata expects.
        return Document(
            id=f"box:{file.id}",
            blob=content_stream.read(),
            source=DocumentSource.BOX,
            semantic_identifier=file.name,
            extension=get_file_ext(file.name),
            doc_updated_at=modified_time,
            size_bytes=file.size,
            metadata=file.metadata,
        )

    def _yield_files_recursive(
        self,
        folder_id,
        start: SecondsSinceUnixEpoch | None,
        end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """Yield document batches for ``folder_id`` and all of its subfolders.

        Files of one listing page form one batch; subfolders are descended
        into as they are encountered, so their batches are yielded before the
        batch of the page that contains them.

        Raises:
            ConnectorMissingCredentialError: if no client has been created.
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        for entries in self._iter_folder_pages(folder_id):
            batch: list[Document] = []
            for entry in entries:
                if entry.type == 'file':
                    document = self._load_document(entry.id, start, end)
                    if document is not None:
                        batch.append(document)
                elif entry.type == 'folder':
                    yield from self._yield_files_recursive(folder_id=entry.id, start=start, end=end)

            if batch:
                yield batch

    def _box_datetime_to_epoch_seconds(self, dt: datetime) -> SecondsSinceUnixEpoch:
        """Convert a Box SDK datetime to Unix epoch seconds (UTC).

        Naive datetimes are interpreted as UTC. Only ``datetime`` is
        supported; callers must filter out anything else.

        Raises:
            TypeError: if ``dt`` is not a ``datetime``.
        """
        if not isinstance(dt, datetime):
            raise TypeError(f"box_datetime_to_epoch_seconds expects datetime, got {type(dt)}")

        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        else:
            dt = dt.astimezone(timezone.utc)

        return SecondsSinceUnixEpoch(int(dt.timestamp()))

    def poll_source(self, start, end):
        """Yield batches of files whose timestamp lies in ``(start, end]``."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=start, end=end)

    def load_from_state(self):
        """Yield batches for every file under the configured folder."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=None, end=None)
class BOX(SyncBase):
    """Sync task that pulls documents from Box through ``BoxConnector``."""
    SOURCE_NAME: str = FileSource.BOX

    async def _generate(self, task: dict):
        """Create the connector and return its document generator.

        A full load is used when ``task["reindex"]`` is ``"1"`` or when
        ``task["poll_range_start"]`` is falsy; otherwise only the range from
        ``poll_range_start`` up to the current UTC time is polled.
        """
        # Resolve the folder once so the connector and the log line agree,
        # and a missing "folder_id" key cannot raise KeyError while logging.
        folder_id = self.conf.get("folder_id", "0")
        self.connector = BoxConnector(
            batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
            folder_id=folder_id,
            use_marker=self.conf.get("use_marker", False),
        )

        self.connector.load_credentials(self.conf["credentials"])
        if task["reindex"] == "1" or not task["poll_range_start"]:
            document_generator = self.connector.load_from_state()
            begin_info = "totally"
        else:
            poll_start = task["poll_range_start"]
            document_generator = self.connector.poll_source(
                poll_start.timestamp(),
                datetime.now(timezone.utc).timestamp(),
            )
            begin_info = "from {}".format(poll_start)
        logging.info("Connect to Box: folder_id({}) {}".format(folder_id, begin_info))
        return document_generator
"2025-11-19T11:32:01.438Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/49/bf3b693025471431ab89252f287e2203394ef0965450af3fd1b995d39646/boxsdk-10.1.0-py3-none-any.whl", hash = "sha256:2770aa7111fdd6a14a6e6447ca2f3eeb306ed123b210368368e8ac938cfb7813", size = 556301, upload-time = "2025-11-19T11:32:00.168Z" }, +] + [[package]] name = "brotli" version = "1.2.0" @@ -4461,12 +4474,12 @@ name = "onnxruntime-gpu" version = "1.19.2" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } dependencies = [ - { name = "coloredlogs" }, - { name = "flatbuffers" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "protobuf" }, - { name = "sympy" }, + { name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/33/06e856502a1d482532cfa7d4c7ca775dfddcd851c7bd1833f5177e567055/onnxruntime_gpu-1.19.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:562fc7c755393eaad9751e56149339dd201ffbfdb3ef5f43ff21d0619ba9045f", size = 226175096, upload-time = "2024-09-04T06:44:07.847Z" }, @@ -6186,6 
+6199,7 @@ dependencies = [ { name = "blinker" }, { name = "boto3" }, { name = "botocore" }, + { name = "boxsdk" }, { name = "cachetools" }, { name = "captcha" }, { name = "chardet" }, @@ -6353,6 +6367,7 @@ requires-dist = [ { name = "blinker", specifier = "==1.7.0" }, { name = "boto3", specifier = "==1.34.140" }, { name = "botocore", specifier = "==1.34.140" }, + { name = "boxsdk", specifier = ">=10.1.0" }, { name = "cachetools", specifier = "==5.3.3" }, { name = "captcha", specifier = ">=0.7.1" }, { name = "chardet", specifier = "==5.2.0" }, diff --git a/web/src/assets/svg/data-source/box.svg b/web/src/assets/svg/data-source/box.svg new file mode 100644 index 000000000..5eec50e55 --- /dev/null +++ b/web/src/assets/svg/data-source/box.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/web/src/pages/user-setting/data-source/contant.tsx b/web/src/pages/user-setting/data-source/contant.tsx index b3e86e118..0e57c4a7f 100644 --- a/web/src/pages/user-setting/data-source/contant.tsx +++ b/web/src/pages/user-setting/data-source/contant.tsx @@ -1,10 +1,10 @@ import { FormFieldType } from '@/components/dynamic-form'; import SvgIcon from '@/components/svg-icon'; import { t } from 'i18next'; -import { ControllerRenderProps } from 'react-hook-form'; import { ConfluenceIndexingModeField } from './component/confluence-token-field'; import GmailTokenField from './component/gmail-token-field'; import GoogleDriveTokenField from './component/google-drive-token-field'; + export enum DataSourceKey { CONFLUENCE = 'confluence', S3 = 's3', @@ -15,6 +15,7 @@ export enum DataSourceKey { GMAIL = 'gmail', JIRA = 'jira', WEBDAV = 'webdav', + BOX = 'box', DROPBOX = 'dropbox', // SHAREPOINT = 'sharepoint', // SLACK = 'slack', @@ -72,6 +73,11 @@ export const DataSourceInfo = { description: t(`setting.${DataSourceKey.DROPBOX}Description`), icon: , }, + [DataSourceKey.BOX]: { + name: 'Box', + description: t(`setting.${DataSourceKey.BOX}Description`), + icon: , + }, }; export const 
DataSourceFormBaseFields = [ @@ -234,11 +240,11 @@ export const DataSourceFormFields = { { label: 'Index Method', name: 'config.index_mode', - type: FormFieldType.Text, // keep as text so RHF registers it + type: FormFieldType.Text, required: false, horizontal: true, labelClassName: 'self-start pt-4', - render: (fieldProps: ControllerRenderProps) => ( + render: (fieldProps: any) => ( ), }, @@ -551,6 +557,41 @@ export const DataSourceFormFields = { placeholder: 'Defaults to 2', }, ], + [DataSourceKey.BOX]: [ + { + label: 'Client ID', + name: 'config.credentials.box_client_id', + type: FormFieldType.Text, + required: true, + }, + { + label: 'Client Secret', + name: 'config.credentials.box_client_secret', + type: FormFieldType.Password, + required: true, + }, + { + label: 'Redirect URI', + name: 'config.credentials.box_redirect_uri', + type: FormFieldType.Text, + required: true, + placeholder: 'https://example.com/oauth2/callback', + }, + { + label: 'Folder ID', + name: 'config.folder_id', + type: FormFieldType.Text, + required: false, + placeholder: 'Defaults to root (0)', + }, + { + label: 'Index recursively', + name: 'config.index_recursively', + type: FormFieldType.Checkbox, + required: false, + defaultValue: false, + }, + ], }; export const DataSourceFormDefaultValues = { @@ -687,4 +728,18 @@ export const DataSourceFormDefaultValues = { }, }, }, + [DataSourceKey.BOX]: { + name: '', + source: DataSourceKey.BOX, + config: { + name: '', + folder_id: '0', + index_recursively: false, + credentials: { + box_client_id: '', + box_client_secret: '', + box_redirect_uri: '', + }, + }, + }, };