This commit is contained in:
Billy Bao 2025-12-05 19:18:07 +08:00
parent bb6022477e
commit fa7e3307f6
9 changed files with 282 additions and 12 deletions

View file

@ -122,7 +122,7 @@ class FileSource(StrEnum):
WEBDAV = "webdav"
MOODLE = "moodle"
DROPBOX = "dropbox"
BOX = "box"
class PipelineTaskType(StrEnum):
PARSE = "Parse"

View file

@ -0,0 +1,167 @@
"""Box connector"""
import os
import logging
from datetime import datetime, timezone
from typing import Any
from box_sdk_gen import BoxClient
from common.data_source.config import DocumentSource, INDEX_BATCH_SIZE
from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
)
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
from common.data_source.models import Document, GenerateDocumentsOutput
from common.data_source.utils import get_file_ext
class BoxConnector(LoadConnector, PollConnector):
    """Load and poll documents from a Box folder tree.

    Files are fetched through a ``box_sdk_gen.BoxClient`` and emitted as
    batches (lists) of ``Document`` objects. Traversal starts at
    ``folder_id`` and always descends into sub-folders.
    """

    # Timestamp attributes probed on a Box file, most relevant first. The
    # modification timestamps come first so that polling also picks up files
    # that were edited after they were created.
    _TIMESTAMP_FIELDS = (
        "modified_at",
        "content_modified_at",
        "created_at",
        "content_created_at",
    )

    def __init__(self, folder_id: str = "0", batch_size: int = INDEX_BATCH_SIZE, use_marker: bool = False) -> None:
        """Store the traversal settings; no network access happens here.

        Args:
            folder_id: Box folder to start from ("0" is the default).
            batch_size: Page size requested from Box for each folder listing.
            use_marker: Use marker-based pagination instead of offset-based.
        """
        self.batch_size = batch_size
        self.folder_id = folder_id
        self.use_marker = use_marker
        # Defined up front so the "credentials not loaded" guards can test for
        # None instead of failing with AttributeError on a missing attribute.
        self.box_client: BoxClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Create the Box client from ``credentials["auth"]``.

        Raises:
            ConnectorMissingCredentialError: If no ``auth`` entry is supplied.
        """
        auth = credentials.get("auth")
        if not auth:
            raise ConnectorMissingCredentialError("Box auth is required")

        self.box_client = BoxClient(auth=auth)
        return None

    def validate_connector_settings(self) -> None:
        """Check that the loaded credentials can reach the Box API.

        Raises:
            ConnectorMissingCredentialError: If credentials were never loaded.
            ConnectorValidationError: If the "current user" request fails.
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        try:
            self.box_client.users.get_user_me()
        except Exception as e:
            logging.exception("[Box]: Failed to validate Box credentials")
            raise ConnectorValidationError(f"Unexpected error during Box settings validation: {e}") from e

    def _fetch_folder_page(self, folder_id: str, *, marker: str | None = None, offset: int | None = None):
        """Request one page of items for ``folder_id`` in the configured pagination mode."""
        params: dict[str, Any] = {
            "folder_id": folder_id,
            "limit": self.batch_size,
            "usemarker": self.use_marker,
        }
        if marker is not None:
            params["marker"] = marker
        if offset is not None:
            params["offset"] = offset
        return self.box_client.folders.get_folder_items(**params)

    def _file_timestamp(self, file: Any) -> datetime | None:
        """Return the first populated timestamp attribute of ``file``, or None."""
        for field in self._TIMESTAMP_FIELDS:
            value = getattr(file, field, None)
            if value:
                return value
        return None

    def _yield_files_recursive(
        self,
        folder_id,
        start: SecondsSinceUnixEpoch | None,
        end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """Yield batches of documents for ``folder_id`` and all of its sub-folders.

        A file is emitted when its timestamp ``t`` satisfies ``start < t <= end``;
        a bound that is None is not applied. Batches produced by a sub-folder
        are yielded as soon as that sub-folder is encountered, i.e. before the
        remaining items of the current page.

        Raises:
            ConnectorMissingCredentialError: If credentials were never loaded.
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        offset = 0
        result = self._fetch_folder_page(folder_id)

        while True:
            entries = result.entries or []
            batch: list[Document] = []

            for entry in entries:
                if entry.type == 'folder':
                    yield from self._yield_files_recursive(folder_id=entry.id, start=start, end=end)
                    continue
                if entry.type != 'file':
                    continue

                file = self.box_client.files.get_file_by_id(entry.id)

                raw_time = self._file_timestamp(file)
                if not raw_time:
                    # Without a timestamp the file can be neither placed in the
                    # poll window nor given an update time, so skip it rather
                    # than reuse a stale value from a previous file.
                    logging.warning("[Box]: Skipping file %s: no timestamp available", file.id)
                    continue

                modified_time = self._box_datetime_to_epoch_seconds(raw_time)
                if start is not None and modified_time <= start:
                    continue
                if end is not None and modified_time > end:
                    continue

                content_bytes = self.box_client.downloads.download_file(file.id)
                batch.append(
                    Document(
                        id=f"box:{file.id}",
                        blob=content_bytes.read(),
                        source=DocumentSource.BOX,
                        semantic_identifier=file.name,
                        extension=get_file_ext(file.name),
                        # NOTE(review): passed as epoch seconds; confirm that
                        # Document accepts this for doc_updated_at.
                        doc_updated_at=modified_time,
                        size_bytes=file.size,
                        metadata=file.metadata
                    )
                )

            if batch:
                yield batch

            if self.use_marker:
                next_marker = getattr(result, "next_marker", None)
                if not next_marker:
                    break
                result = self._fetch_folder_page(folder_id, marker=next_marker)
            else:
                # Offset pagination never returns a marker, so advance by the
                # number of items received until the reported total is reached.
                offset += len(entries)
                total_count = getattr(result, "total_count", None)
                if not entries or total_count is None or offset >= total_count:
                    break
                result = self._fetch_folder_page(folder_id, offset=offset)

    def _box_datetime_to_epoch_seconds(self, dt: datetime) -> SecondsSinceUnixEpoch:
        """Convert a Box SDK datetime to Unix epoch seconds (UTC).

        Naive datetimes are interpreted as UTC; aware ones are converted to
        UTC. Sub-second precision is truncated.

        Raises:
            TypeError: If ``dt`` is not a ``datetime``; callers must filter
                out missing values beforehand.
        """
        if not isinstance(dt, datetime):
            raise TypeError(f"box_datetime_to_epoch_seconds expects datetime, got {type(dt)}")

        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        else:
            dt = dt.astimezone(timezone.utc)

        return SecondsSinceUnixEpoch(int(dt.timestamp()))

    def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
        """Yield batches of files whose timestamp lies in ``(start, end]``."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=start, end=end)

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Yield batches for every file under the configured folder."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=None, end=None)
# from flask import Flask, request, redirect
# from box_sdk_gen import BoxClient, BoxOAuth, OAuthConfig, GetAuthorizeUrlOptions
# app = Flask(__name__)
# AUTH = BoxOAuth(
# OAuthConfig(client_id=os.environ["BOX_CLIENT_ID"], client_secret=os.environ["BOX_CLIENT_SECRET"])  # never commit real app credentials
# )
# @app.route("/")
# def get_auth():
# auth_url = AUTH.get_authorize_url(
# options=GetAuthorizeUrlOptions(redirect_uri="http://localhost:4999/oauth2callback")
# )
# return redirect(auth_url, code=302)
# @app.route("/oauth2callback")
# def callback():
# AUTH.get_tokens_authorization_code_grant(request.args.get("code"))
# box = BoxConnector()
# box.load_credentials({"auth": AUTH})
# lst = []
# for file in box.load_from_state():
# for f in file:
# lst.append(f.semantic_identifier)
# return lst
# Manual-testing entry point. The Flask OAuth demo above is commented out, so
# running this module directly is currently a no-op.
if __name__ == "__main__":
    pass
    # app.run(port=4999)

View file

@ -52,7 +52,7 @@ class DocumentSource(str, Enum):
MOODLE = "moodle"
S3_COMPATIBLE = "s3_compatible"
DROPBOX = "dropbox"
BOX = "box"
class FileOrigin(str, Enum):
"""File origins"""
@ -227,6 +227,7 @@ _DEFAULT_PAGINATION_LIMIT = 1000
_PROBLEMATIC_EXPANSIONS = "body.storage.value"
_REPLACEMENT_EXPANSIONS = "body.view.value"
BOX_WEB_OAUTH_REDIRECT_URI = os.environ.get("BOX_WEB_OAUTH_REDIRECT_URI", "http://localhost:9380/v1/connector/box/oauth/web/callback")
class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
# remove links entirely

View file

@ -49,7 +49,7 @@ MISSING_SCOPES_ERROR_STR = "client not authorized for any of the scopes requeste
SCOPE_INSTRUCTIONS = ""
GOOGLE_WEB_OAUTH_POPUP_TEMPLATE = """<!DOCTYPE html>
WEB_OAUTH_POPUP_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />

View file

@ -154,6 +154,7 @@ dependencies = [
"exceptiongroup>=1.3.0,<2.0.0",
"ffmpeg-python>=0.2.0",
"imageio-ffmpeg>=0.6.0",
"boxsdk>=10.1.0",
]
[dependency-groups]

View file

@ -42,6 +42,7 @@ from common.constants import FileSource, TaskStatus
from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.confluence_connector import ConfluenceConnector
from common.data_source.gmail_connector import GmailConnector
from common.data_source.box_connector import BoxConnector
from common.data_source.interfaces import CheckpointOutputWrapper
from common.data_source.utils import load_all_docs_from_checkpoint_connector
from common.log_utils import init_root_logger
@ -599,6 +600,34 @@ class Moodle(SyncBase):
return document_generator
class BOX(SyncBase):
    """Sync task that pulls documents from Box through ``BoxConnector``."""

    SOURCE_NAME: str = FileSource.BOX

    async def _generate(self, task: dict):
        """Build the connector and return its document generator.

        A full load is used when ``task["reindex"]`` is ``"1"`` or when no
        ``poll_range_start`` is set; otherwise only the window from
        ``poll_range_start`` up to the current UTC time is polled.

        Args:
            task: Sync task record; ``reindex`` and ``poll_range_start`` are read.

        Returns:
            The generator produced by ``BoxConnector.load_from_state`` or
            ``BoxConnector.poll_source``.
        """
        # Resolve the folder once so the connector and the log line agree and
        # a missing "folder_id" key cannot raise after the connector is built.
        folder_id = self.conf.get("folder_id", "0")

        self.connector = BoxConnector(
            batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
            folder_id=folder_id,
            use_marker=self.conf.get("use_marker", False)
        )

        # NOTE(review): BoxConnector.load_credentials expects an "auth" entry
        # in this dict - confirm the stored credentials provide it.
        self.connector.load_credentials(self.conf["credentials"])

        if task["reindex"] == "1" or not task["poll_range_start"]:
            document_generator = self.connector.load_from_state()
            begin_info = "totally"
        else:
            poll_start = task["poll_range_start"]
            document_generator = self.connector.poll_source(
                poll_start.timestamp(),
                datetime.now(timezone.utc).timestamp()
            )
            begin_info = "from {}".format(poll_start)

        logging.info("Connect to Box: folder_id({}) {}".format(folder_id, begin_info))
        return document_generator
func_factory = {
FileSource.S3: S3,
FileSource.NOTION: Notion,
@ -613,6 +642,7 @@ func_factory = {
FileSource.MOODLE: Moodle,
FileSource.DROPBOX: Dropbox,
FileSource.WEBDAV: WebDAV,
FileSource.BOX: BOX
}

27
uv.lock generated
View file

@ -643,6 +643,19 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/f5/23/91c8b50588470d80317f4afca93d3d542139bdc38ed5ad1b512fba416af3/botocore-1.34.140-py3-none-any.whl", hash = "sha256:43940d3a67d946ba3301631ba4078476a75f1015d4fb0fb0272d0b754b2cf9de", size = 12354845, upload-time = "2024-07-05T19:19:10.578Z" },
]
[[package]]
name = "boxsdk"
version = "10.1.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "requests" },
{ name = "requests-toolbelt" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/a6/b7afb8ee4745b61470322ada33c0463b26d221367bc23c8e8c7d4b7b6cbe/boxsdk-10.1.0.tar.gz", hash = "sha256:fb409b682d173eeb9a72c03ca0dddf2e66dbd79199235815a2ad61bf39c4f231", size = 265229, upload-time = "2025-11-19T11:32:01.438Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/49/bf3b693025471431ab89252f287e2203394ef0965450af3fd1b995d39646/boxsdk-10.1.0-py3-none-any.whl", hash = "sha256:2770aa7111fdd6a14a6e6447ca2f3eeb306ed123b210368368e8ac938cfb7813", size = 556301, upload-time = "2025-11-19T11:32:00.168Z" },
]
[[package]]
name = "brotli"
version = "1.2.0"
@ -4461,12 +4474,12 @@ name = "onnxruntime-gpu"
version = "1.19.2"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "coloredlogs" },
{ name = "flatbuffers" },
{ name = "numpy" },
{ name = "packaging" },
{ name = "protobuf" },
{ name = "sympy" },
{ name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/33/06e856502a1d482532cfa7d4c7ca775dfddcd851c7bd1833f5177e567055/onnxruntime_gpu-1.19.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:562fc7c755393eaad9751e56149339dd201ffbfdb3ef5f43ff21d0619ba9045f", size = 226175096, upload-time = "2024-09-04T06:44:07.847Z" },
@ -6186,6 +6199,7 @@ dependencies = [
{ name = "blinker" },
{ name = "boto3" },
{ name = "botocore" },
{ name = "boxsdk" },
{ name = "cachetools" },
{ name = "captcha" },
{ name = "chardet" },
@ -6353,6 +6367,7 @@ requires-dist = [
{ name = "blinker", specifier = "==1.7.0" },
{ name = "boto3", specifier = "==1.34.140" },
{ name = "botocore", specifier = "==1.34.140" },
{ name = "boxsdk", specifier = ">=10.1.0" },
{ name = "cachetools", specifier = "==5.3.3" },
{ name = "captcha", specifier = ">=0.7.1" },
{ name = "chardet", specifier = "==5.2.0" },

View file

@ -0,0 +1 @@
<svg width="41" height="22" xmlns="http://www.w3.org/2000/svg"><path d="M39.7 19.2c.5.7.4 1.6-.2 2.1-.7.5-1.7.4-2.2-.2l-3.5-4.5-3.4 4.4c-.5.7-1.5.7-2.2.2-.7-.5-.8-1.4-.3-2.1l4-5.2-4-5.2c-.5-.7-.3-1.7.3-2.2.7-.5 1.7-.3 2.2.3l3.4 4.5L37.3 7c.5-.7 1.4-.8 2.2-.3.7.5.7 1.5.2 2.2L35.8 14l3.9 5.2zm-18.2-.6c-2.6 0-4.7-2-4.7-4.6 0-2.5 2.1-4.6 4.7-4.6s4.7 2.1 4.7 4.6c-.1 2.6-2.2 4.6-4.7 4.6zm-13.8 0c-2.6 0-4.7-2-4.7-4.6 0-2.5 2.1-4.6 4.7-4.6s4.7 2.1 4.7 4.6c0 2.6-2.1 4.6-4.7 4.6zM21.5 6.4c-2.9 0-5.5 1.6-6.8 4-1.3-2.4-3.9-4-6.9-4-1.8 0-3.4.6-4.7 1.5V1.5C3.1.7 2.4 0 1.6 0 .7 0 0 .7 0 1.5v12.6c.1 4.2 3.5 7.5 7.7 7.5 3 0 5.6-1.7 6.9-4.1 1.3 2.4 3.9 4.1 6.8 4.1 4.3 0 7.8-3.4 7.8-7.7.1-4.1-3.4-7.5-7.7-7.5z" fill="#0071F7"/></svg>

After

Width:  |  Height:  |  Size: 723 B

View file

@ -1,10 +1,10 @@
import { FormFieldType } from '@/components/dynamic-form';
import SvgIcon from '@/components/svg-icon';
import { t } from 'i18next';
import { ControllerRenderProps } from 'react-hook-form';
import { ConfluenceIndexingModeField } from './component/confluence-token-field';
import GmailTokenField from './component/gmail-token-field';
import GoogleDriveTokenField from './component/google-drive-token-field';
export enum DataSourceKey {
CONFLUENCE = 'confluence',
S3 = 's3',
@ -15,6 +15,7 @@ export enum DataSourceKey {
GMAIL = 'gmail',
JIRA = 'jira',
WEBDAV = 'webdav',
BOX = 'box',
DROPBOX = 'dropbox',
// SHAREPOINT = 'sharepoint',
// SLACK = 'slack',
@ -72,6 +73,11 @@ export const DataSourceInfo = {
description: t(`setting.${DataSourceKey.DROPBOX}Description`),
icon: <SvgIcon name={'data-source/dropbox'} width={38} />,
},
[DataSourceKey.BOX]: {
name: 'Box',
description: t(`setting.${DataSourceKey.BOX}Description`),
icon: <SvgIcon name={'data-source/box'} width={38} />,
},
};
export const DataSourceFormBaseFields = [
@ -234,11 +240,11 @@ export const DataSourceFormFields = {
{
label: 'Index Method',
name: 'config.index_mode',
type: FormFieldType.Text, // keep as text so RHF registers it
type: FormFieldType.Text,
required: false,
horizontal: true,
labelClassName: 'self-start pt-4',
render: (fieldProps: ControllerRenderProps) => (
render: (fieldProps: any) => (
<ConfluenceIndexingModeField {...fieldProps} />
),
},
@ -551,6 +557,41 @@ export const DataSourceFormFields = {
placeholder: 'Defaults to 2',
},
],
[DataSourceKey.BOX]: [
{
label: 'Client ID',
name: 'config.credentials.box_client_id',
type: FormFieldType.Text,
required: true,
},
{
label: 'Client Secret',
name: 'config.credentials.box_client_secret',
type: FormFieldType.Password,
required: true,
},
{
label: 'Redirect URI',
name: 'config.credentials.box_redirect_uri',
type: FormFieldType.Text,
required: true,
placeholder: 'https://example.com/oauth2/callback',
},
{
label: 'Folder ID',
name: 'config.folder_id',
type: FormFieldType.Text,
required: false,
placeholder: 'Defaults to root (0)',
},
{
label: 'Index recursively',
name: 'config.index_recursively',
type: FormFieldType.Checkbox,
required: false,
defaultValue: false,
},
],
};
export const DataSourceFormDefaultValues = {
@ -687,4 +728,18 @@ export const DataSourceFormDefaultValues = {
},
},
},
[DataSourceKey.BOX]: {
name: '',
source: DataSourceKey.BOX,
config: {
name: '',
folder_id: '0',
index_recursively: false,
credentials: {
box_client_id: '',
box_client_secret: '',
box_redirect_uri: '',
},
},
},
};