add box
This commit is contained in:
parent
bb6022477e
commit
fa7e3307f6
9 changed files with 282 additions and 12 deletions
|
|
@ -122,7 +122,7 @@ class FileSource(StrEnum):
|
|||
WEBDAV = "webdav"
|
||||
MOODLE = "moodle"
|
||||
DROPBOX = "dropbox"
|
||||
|
||||
BOX = "box"
|
||||
|
||||
class PipelineTaskType(StrEnum):
|
||||
PARSE = "Parse"
|
||||
|
|
|
|||
167
common/data_source/box_connector.py
Normal file
167
common/data_source/box_connector.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
"""Box connector"""
|
||||
import os
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from box_sdk_gen import BoxClient
|
||||
from common.data_source.config import DocumentSource, INDEX_BATCH_SIZE
|
||||
from common.data_source.exceptions import (
|
||||
ConnectorMissingCredentialError,
|
||||
ConnectorValidationError,
|
||||
)
|
||||
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
|
||||
from common.data_source.models import Document, GenerateDocumentsOutput
|
||||
from common.data_source.utils import get_file_ext
|
||||
|
||||
class BoxConnector(LoadConnector, PollConnector):
    """Connector that walks a Box folder tree and yields its files as documents.

    Files are listed folder by folder (sub-folders are visited recursively),
    downloaded through the Box SDK client and emitted in batches of
    ``Document`` objects.
    """

    # Timestamp attributes consulted (in order) to decide when a file last
    # changed. Modification times come first so that polling picks up files
    # that were edited after they were created.
    _TIMESTAMP_ATTRS = ("modified_at", "content_modified_at", "created_at", "content_created_at")

    def __init__(self, folder_id: str = "0", batch_size: int = INDEX_BATCH_SIZE, use_marker: bool = False) -> None:
        """Store the connector settings.

        Args:
            folder_id: Box folder to start from; "0" is the default used by callers.
            batch_size: Page size passed to the folder listing call.
            use_marker: When True, page through listings with markers;
                otherwise page with offsets.
        """
        self.batch_size = batch_size
        self.folder_id = folder_id
        self.use_marker = use_marker
        # Populated by load_credentials(). Initialised here so the
        # "is None" guards below raise the intended credential error
        # instead of an AttributeError.
        self.box_client: BoxClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Create the Box client from ``credentials["auth"]``.

        Raises:
            ConnectorMissingCredentialError: if no ``auth`` entry is supplied.
        """
        auth = credentials.get("auth")
        if not auth:
            raise ConnectorMissingCredentialError("Box auth is required")

        self.box_client = BoxClient(auth=auth)
        return None

    def validate_connector_settings(self) -> None:
        """Check the credentials by requesting the current user.

        Raises:
            ConnectorMissingCredentialError: if credentials were never loaded.
            ConnectorValidationError: if the request to Box fails.
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        try:
            self.box_client.users.get_user_me()
        except Exception as e:
            logging.exception("[Box]: Failed to validate Box credentials")
            # Chain the cause so the original traceback is preserved.
            raise ConnectorValidationError(f"Unexpected error during Box settings validation: {e}") from e

    def _iter_folder_pages(self, folder_id):
        """Yield every listing page for the items directly inside ``folder_id``."""
        marker = None
        offset = 0
        while True:
            params: dict[str, Any] = {
                "folder_id": folder_id,
                "limit": self.batch_size,
                "usemarker": self.use_marker,
            }
            if self.use_marker:
                # The first marker-based request carries no marker.
                if marker:
                    params["marker"] = marker
            else:
                params["offset"] = offset

            page = self.box_client.folders.get_folder_items(**params)
            yield page

            entries = page.entries or []
            if self.use_marker:
                marker = getattr(page, "next_marker", None)
                if not marker:
                    return
            else:
                # Offset paging returns no marker, so advance by the number of
                # entries received until the reported total is reached.
                offset += len(entries)
                total = getattr(page, "total_count", None)
                if not entries or total is None or offset >= total:
                    return

    def _resolve_updated_at(self, file_info) -> SecondsSinceUnixEpoch | None:
        """Return the first available timestamp of ``file_info`` as epoch seconds, or None."""
        for attr in self._TIMESTAMP_ATTRS:
            raw_time = getattr(file_info, attr, None)
            if raw_time:
                return self._box_datetime_to_epoch_seconds(raw_time)
        return None

    def _build_document(self, file_id, start, end):
        """Fetch one file and build its ``Document``.

        Returns None when the file lies outside the ``(start, end]`` window or
        when Box returns no content stream for it.
        """
        file_info = self.box_client.files.get_file_by_id(file_id)

        updated_at = self._resolve_updated_at(file_info)
        if updated_at is None:
            # No usable timestamp: the time window cannot be applied, so the
            # file is indexed anyway.
            # NOTE(review): stamped with the current time so the field stays
            # populated — confirm this is acceptable for Document.doc_updated_at.
            updated_at = SecondsSinceUnixEpoch(int(datetime.now(timezone.utc).timestamp()))
        else:
            if start is not None and updated_at <= start:
                return None
            if end is not None and updated_at > end:
                return None

        stream = self.box_client.downloads.download_file(file_info.id)
        if stream is None:
            logging.warning("[Box]: No content returned for file %s; skipping", file_info.id)
            return None

        return Document(
            id=f"box:{file_info.id}",
            blob=stream.read(),
            source=DocumentSource.BOX,
            semantic_identifier=file_info.name,
            extension=get_file_ext(file_info.name),
            doc_updated_at=updated_at,
            size_bytes=file_info.size,
            metadata=file_info.metadata,
        )

    def _yield_files_recursive(
        self,
        folder_id,
        start: SecondsSinceUnixEpoch | None,
        end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """Yield batches of documents for ``folder_id`` and all of its sub-folders.

        Args:
            folder_id: Box folder whose items are listed.
            start: Exclusive lower bound (epoch seconds) on the file timestamp, or None.
            end: Inclusive upper bound (epoch seconds) on the file timestamp, or None.

        Raises:
            ConnectorMissingCredentialError: if credentials were never loaded
                (raised when the generator is first advanced).
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        for page in self._iter_folder_pages(folder_id):
            batch: list[Document] = []
            for entry in page.entries or []:
                if entry.type == 'file':
                    document = self._build_document(entry.id, start, end)
                    if document is not None:
                        batch.append(document)
                elif entry.type == 'folder':
                    # Sub-folder batches are emitted before the current page's batch.
                    yield from self._yield_files_recursive(folder_id=entry.id, start=start, end=end)

            if batch:
                yield batch

    def _box_datetime_to_epoch_seconds(self, dt: datetime) -> SecondsSinceUnixEpoch:
        """Convert a Box SDK datetime to Unix epoch seconds (UTC).

        Only supports datetime; any non-datetime should be filtered out by caller.
        Naive datetimes are interpreted as UTC.

        Raises:
            TypeError: if ``dt`` is not a ``datetime``.
        """
        if not isinstance(dt, datetime):
            raise TypeError(f"box_datetime_to_epoch_seconds expects datetime, got {type(dt)}")

        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        else:
            dt = dt.astimezone(timezone.utc)

        return SecondsSinceUnixEpoch(int(dt.timestamp()))

    def poll_source(self, start, end):
        """Yield batches of files whose timestamp falls in ``(start, end]``."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=start, end=end)

    def load_from_state(self):
        """Yield batches for every file under the configured folder."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=None, end=None)
|
||||
|
||||
|
||||
# from flask import Flask, request, redirect
|
||||
|
||||
# from box_sdk_gen import BoxClient, BoxOAuth, OAuthConfig, GetAuthorizeUrlOptions
|
||||
|
||||
# app = Flask(__name__)
|
||||
|
||||
# AUTH = BoxOAuth(
|
||||
#     OAuthConfig(client_id=os.environ["BOX_CLIENT_ID"], client_secret=os.environ["BOX_CLIENT_SECRET"])
|
||||
# )
|
||||
|
||||
|
||||
# @app.route("/")
|
||||
# def get_auth():
|
||||
# auth_url = AUTH.get_authorize_url(
|
||||
# options=GetAuthorizeUrlOptions(redirect_uri="http://localhost:4999/oauth2callback")
|
||||
# )
|
||||
# return redirect(auth_url, code=302)
|
||||
|
||||
|
||||
# @app.route("/oauth2callback")
|
||||
# def callback():
|
||||
# AUTH.get_tokens_authorization_code_grant(request.args.get("code"))
|
||||
# box = BoxConnector()
|
||||
# box.load_credentials({"auth": AUTH})
|
||||
|
||||
# lst = []
|
||||
# for file in box.load_from_state():
|
||||
# for f in file:
|
||||
# lst.append(f.semantic_identifier)
|
||||
|
||||
# return lst
|
||||
|
||||
if __name__ == "__main__":
    # Intentionally a no-op: the manual OAuth test harness above is commented out.
    pass
    # app.run(port=4999)
|
||||
|
|
@ -52,7 +52,7 @@ class DocumentSource(str, Enum):
|
|||
MOODLE = "moodle"
|
||||
S3_COMPATIBLE = "s3_compatible"
|
||||
DROPBOX = "dropbox"
|
||||
|
||||
BOX = "box"
|
||||
|
||||
class FileOrigin(str, Enum):
|
||||
"""File origins"""
|
||||
|
|
@ -227,6 +227,7 @@ _DEFAULT_PAGINATION_LIMIT = 1000
|
|||
_PROBLEMATIC_EXPANSIONS = "body.storage.value"
|
||||
_REPLACEMENT_EXPANSIONS = "body.view.value"
|
||||
|
||||
BOX_WEB_OAUTH_REDIRECT_URI = os.environ.get("BOX_WEB_OAUTH_REDIRECT_URI", "http://localhost:9380/v1/connector/box/oauth/web/callback")
|
||||
|
||||
class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
|
||||
# remove links entirely
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ MISSING_SCOPES_ERROR_STR = "client not authorized for any of the scopes requeste
|
|||
SCOPE_INSTRUCTIONS = ""
|
||||
|
||||
|
||||
GOOGLE_WEB_OAUTH_POPUP_TEMPLATE = """<!DOCTYPE html>
|
||||
WEB_OAUTH_POPUP_TEMPLATE = """<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
|
|
|
|||
|
|
@ -154,6 +154,7 @@ dependencies = [
|
|||
"exceptiongroup>=1.3.0,<2.0.0",
|
||||
"ffmpeg-python>=0.2.0",
|
||||
"imageio-ffmpeg>=0.6.0",
|
||||
"boxsdk>=10.1.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ from common.constants import FileSource, TaskStatus
|
|||
from common.data_source.config import INDEX_BATCH_SIZE
|
||||
from common.data_source.confluence_connector import ConfluenceConnector
|
||||
from common.data_source.gmail_connector import GmailConnector
|
||||
from common.data_source.box_connector import BoxConnector
|
||||
from common.data_source.interfaces import CheckpointOutputWrapper
|
||||
from common.data_source.utils import load_all_docs_from_checkpoint_connector
|
||||
from common.log_utils import init_root_logger
|
||||
|
|
@ -599,6 +600,34 @@ class Moodle(SyncBase):
|
|||
return document_generator
|
||||
|
||||
|
||||
class BOX(SyncBase):
    """Sync task that pulls documents from Box through ``BoxConnector``."""

    SOURCE_NAME: str = FileSource.BOX

    async def _generate(self, task: dict):
        """Build the Box connector and return its document generator.

        A full load is used when the task requests a reindex or has no
        ``poll_range_start``; otherwise only the window from
        ``poll_range_start`` to the current UTC time is polled.
        """
        # Resolve the folder once so the connector and the log line agree,
        # and a missing optional "folder_id" cannot raise KeyError.
        folder_id = self.conf.get("folder_id", "0")
        self.connector = BoxConnector(
            batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
            folder_id=folder_id,
            use_marker=self.conf.get("use_marker", False),
        )

        # NOTE(review): BoxConnector.load_credentials reads credentials["auth"]
        # and hands it to the Box client — confirm the stored configuration
        # supplies that entry.
        self.connector.load_credentials(self.conf["credentials"])

        if task["reindex"] == "1" or not task["poll_range_start"]:
            document_generator = self.connector.load_from_state()
            begin_info = "totally"
        else:
            # poll_range_start is truthy here, so no further None check is needed.
            poll_start = task["poll_range_start"]
            document_generator = self.connector.poll_source(
                poll_start.timestamp(),
                datetime.now(timezone.utc).timestamp(),
            )
            begin_info = "from {}".format(poll_start)

        logging.info("Connect to Box: folder_id({}) {}".format(folder_id, begin_info))
        return document_generator
|
||||
|
||||
func_factory = {
|
||||
FileSource.S3: S3,
|
||||
FileSource.NOTION: Notion,
|
||||
|
|
@ -613,6 +642,7 @@ func_factory = {
|
|||
FileSource.MOODLE: Moodle,
|
||||
FileSource.DROPBOX: Dropbox,
|
||||
FileSource.WEBDAV: WebDAV,
|
||||
FileSource.BOX: BOX
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
27
uv.lock
generated
27
uv.lock
generated
|
|
@ -643,6 +643,19 @@ wheels = [
|
|||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/f5/23/91c8b50588470d80317f4afca93d3d542139bdc38ed5ad1b512fba416af3/botocore-1.34.140-py3-none-any.whl", hash = "sha256:43940d3a67d946ba3301631ba4078476a75f1015d4fb0fb0272d0b754b2cf9de", size = 12354845, upload-time = "2024-07-05T19:19:10.578Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boxsdk"
|
||||
version = "10.1.0"
|
||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||
dependencies = [
|
||||
{ name = "requests" },
|
||||
{ name = "requests-toolbelt" },
|
||||
]
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/a6/b7afb8ee4745b61470322ada33c0463b26d221367bc23c8e8c7d4b7b6cbe/boxsdk-10.1.0.tar.gz", hash = "sha256:fb409b682d173eeb9a72c03ca0dddf2e66dbd79199235815a2ad61bf39c4f231", size = 265229, upload-time = "2025-11-19T11:32:01.438Z" }
|
||||
wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/49/bf3b693025471431ab89252f287e2203394ef0965450af3fd1b995d39646/boxsdk-10.1.0-py3-none-any.whl", hash = "sha256:2770aa7111fdd6a14a6e6447ca2f3eeb306ed123b210368368e8ac938cfb7813", size = 556301, upload-time = "2025-11-19T11:32:00.168Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "1.2.0"
|
||||
|
|
@ -4461,12 +4474,12 @@ name = "onnxruntime-gpu"
|
|||
version = "1.19.2"
|
||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||
dependencies = [
|
||||
{ name = "coloredlogs" },
|
||||
{ name = "flatbuffers" },
|
||||
{ name = "numpy" },
|
||||
{ name = "packaging" },
|
||||
{ name = "protobuf" },
|
||||
{ name = "sympy" },
|
||||
{ name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/33/06e856502a1d482532cfa7d4c7ca775dfddcd851c7bd1833f5177e567055/onnxruntime_gpu-1.19.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:562fc7c755393eaad9751e56149339dd201ffbfdb3ef5f43ff21d0619ba9045f", size = 226175096, upload-time = "2024-09-04T06:44:07.847Z" },
|
||||
|
|
@ -6186,6 +6199,7 @@ dependencies = [
|
|||
{ name = "blinker" },
|
||||
{ name = "boto3" },
|
||||
{ name = "botocore" },
|
||||
{ name = "boxsdk" },
|
||||
{ name = "cachetools" },
|
||||
{ name = "captcha" },
|
||||
{ name = "chardet" },
|
||||
|
|
@ -6353,6 +6367,7 @@ requires-dist = [
|
|||
{ name = "blinker", specifier = "==1.7.0" },
|
||||
{ name = "boto3", specifier = "==1.34.140" },
|
||||
{ name = "botocore", specifier = "==1.34.140" },
|
||||
{ name = "boxsdk", specifier = ">=10.1.0" },
|
||||
{ name = "cachetools", specifier = "==5.3.3" },
|
||||
{ name = "captcha", specifier = ">=0.7.1" },
|
||||
{ name = "chardet", specifier = "==5.2.0" },
|
||||
|
|
|
|||
1
web/src/assets/svg/data-source/box.svg
Normal file
1
web/src/assets/svg/data-source/box.svg
Normal file
|
|
@ -0,0 +1 @@
|
|||
<svg width="41" height="22" xmlns="http://www.w3.org/2000/svg"><path d="M39.7 19.2c.5.7.4 1.6-.2 2.1-.7.5-1.7.4-2.2-.2l-3.5-4.5-3.4 4.4c-.5.7-1.5.7-2.2.2-.7-.5-.8-1.4-.3-2.1l4-5.2-4-5.2c-.5-.7-.3-1.7.3-2.2.7-.5 1.7-.3 2.2.3l3.4 4.5L37.3 7c.5-.7 1.4-.8 2.2-.3.7.5.7 1.5.2 2.2L35.8 14l3.9 5.2zm-18.2-.6c-2.6 0-4.7-2-4.7-4.6 0-2.5 2.1-4.6 4.7-4.6s4.7 2.1 4.7 4.6c-.1 2.6-2.2 4.6-4.7 4.6zm-13.8 0c-2.6 0-4.7-2-4.7-4.6 0-2.5 2.1-4.6 4.7-4.6s4.7 2.1 4.7 4.6c0 2.6-2.1 4.6-4.7 4.6zM21.5 6.4c-2.9 0-5.5 1.6-6.8 4-1.3-2.4-3.9-4-6.9-4-1.8 0-3.4.6-4.7 1.5V1.5C3.1.7 2.4 0 1.6 0 .7 0 0 .7 0 1.5v12.6c.1 4.2 3.5 7.5 7.7 7.5 3 0 5.6-1.7 6.9-4.1 1.3 2.4 3.9 4.1 6.8 4.1 4.3 0 7.8-3.4 7.8-7.7.1-4.1-3.4-7.5-7.7-7.5z" fill="#0071F7"/></svg>
|
||||
|
After Width: | Height: | Size: 723 B |
|
|
@ -1,10 +1,10 @@
|
|||
import { FormFieldType } from '@/components/dynamic-form';
|
||||
import SvgIcon from '@/components/svg-icon';
|
||||
import { t } from 'i18next';
|
||||
import { ControllerRenderProps } from 'react-hook-form';
|
||||
import { ConfluenceIndexingModeField } from './component/confluence-token-field';
|
||||
import GmailTokenField from './component/gmail-token-field';
|
||||
import GoogleDriveTokenField from './component/google-drive-token-field';
|
||||
|
||||
export enum DataSourceKey {
|
||||
CONFLUENCE = 'confluence',
|
||||
S3 = 's3',
|
||||
|
|
@ -15,6 +15,7 @@ export enum DataSourceKey {
|
|||
GMAIL = 'gmail',
|
||||
JIRA = 'jira',
|
||||
WEBDAV = 'webdav',
|
||||
BOX = 'box',
|
||||
DROPBOX = 'dropbox',
|
||||
// SHAREPOINT = 'sharepoint',
|
||||
// SLACK = 'slack',
|
||||
|
|
@ -72,6 +73,11 @@ export const DataSourceInfo = {
|
|||
description: t(`setting.${DataSourceKey.DROPBOX}Description`),
|
||||
icon: <SvgIcon name={'data-source/dropbox'} width={38} />,
|
||||
},
|
||||
[DataSourceKey.BOX]: {
|
||||
name: 'Box',
|
||||
description: t(`setting.${DataSourceKey.BOX}Description`),
|
||||
icon: <SvgIcon name={'data-source/box'} width={38} />,
|
||||
},
|
||||
};
|
||||
|
||||
export const DataSourceFormBaseFields = [
|
||||
|
|
@ -234,11 +240,11 @@ export const DataSourceFormFields = {
|
|||
{
|
||||
label: 'Index Method',
|
||||
name: 'config.index_mode',
|
||||
type: FormFieldType.Text, // keep as text so RHF registers it
|
||||
type: FormFieldType.Text,
|
||||
required: false,
|
||||
horizontal: true,
|
||||
labelClassName: 'self-start pt-4',
|
||||
render: (fieldProps: ControllerRenderProps) => (
|
||||
render: (fieldProps: any) => (
|
||||
<ConfluenceIndexingModeField {...fieldProps} />
|
||||
),
|
||||
},
|
||||
|
|
@ -551,6 +557,41 @@ export const DataSourceFormFields = {
|
|||
placeholder: 'Defaults to 2',
|
||||
},
|
||||
],
|
||||
[DataSourceKey.BOX]: [
|
||||
{
|
||||
label: 'Client ID',
|
||||
name: 'config.credentials.box_client_id',
|
||||
type: FormFieldType.Text,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
label: 'Client Secret',
|
||||
name: 'config.credentials.box_client_secret',
|
||||
type: FormFieldType.Password,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
label: 'Redirect URI',
|
||||
name: 'config.credentials.box_redirect_uri',
|
||||
type: FormFieldType.Text,
|
||||
required: true,
|
||||
placeholder: 'https://example.com/oauth2/callback',
|
||||
},
|
||||
{
|
||||
label: 'Folder ID',
|
||||
name: 'config.folder_id',
|
||||
type: FormFieldType.Text,
|
||||
required: false,
|
||||
placeholder: 'Defaults to root (0)',
|
||||
},
|
||||
{
|
||||
label: 'Index recursively',
|
||||
name: 'config.index_recursively',
|
||||
type: FormFieldType.Checkbox,
|
||||
required: false,
|
||||
defaultValue: false,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
export const DataSourceFormDefaultValues = {
|
||||
|
|
@ -687,4 +728,18 @@ export const DataSourceFormDefaultValues = {
|
|||
},
|
||||
},
|
||||
},
|
||||
[DataSourceKey.BOX]: {
|
||||
name: '',
|
||||
source: DataSourceKey.BOX,
|
||||
config: {
|
||||
name: '',
|
||||
folder_id: '0',
|
||||
index_recursively: false,
|
||||
credentials: {
|
||||
box_client_id: '',
|
||||
box_client_secret: '',
|
||||
box_redirect_uri: '',
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue