diff --git a/conf/service_conf.yaml b/conf/service_conf.yaml index 6b3cef80e..da2ee654e 100644 --- a/conf/service_conf.yaml +++ b/conf/service_conf.yaml @@ -17,6 +17,8 @@ minio: user: 'rag_flow' password: 'infini_rag_flow' host: 'localhost:9000' + bucket: '' + prefix_path: '' es: hosts: 'http://localhost:1200' username: 'elastic' diff --git a/docker/.env.single-bucket-example b/docker/.env.single-bucket-example new file mode 100644 index 000000000..96120b388 --- /dev/null +++ b/docker/.env.single-bucket-example @@ -0,0 +1,108 @@ +# Example: Single Bucket Mode Configuration +# +# This file shows how to configure RAGFlow to use a single MinIO/S3 bucket +# with directory structure instead of creating multiple buckets. + +# ============================================================================ +# MinIO/S3 Configuration for Single Bucket Mode +# ============================================================================ + +# MinIO/S3 Endpoint (with port if not default) +# The MinIO client is always created with secure=True (TLS), so this endpoint must serve HTTPS +export MINIO_HOST=minio.example.com:443 + +# Access credentials +export MINIO_USER=your-access-key +export MINIO_PASSWORD=your-secret-password-here + +# Single Bucket Configuration (NEW!) +# If set, all data will be stored in this bucket instead of creating +# separate buckets for each knowledge base +export MINIO_BUCKET=ragflow-bucket + +# Optional: Prefix path within the bucket (NEW!) 
+# If set, all files will be stored under this prefix +# Example: bucket/prefix_path/kb_id/file.pdf +export MINIO_PREFIX_PATH=ragflow + +# ============================================================================ +# Alternative: Multi-Bucket Mode (Default) +# ============================================================================ +# +# To use the original multi-bucket mode, simply don't set MINIO_BUCKET +# and MINIO_PREFIX_PATH: +# +# export MINIO_HOST=minio.local +# export MINIO_USER=admin +# export MINIO_PASSWORD=password +# # MINIO_BUCKET not set +# # MINIO_PREFIX_PATH not set + +# ============================================================================ +# Storage Mode Selection (Environment Variable) +# ============================================================================ +# +# Make sure this is set to use MinIO (default) +export STORAGE_IMPL=MINIO + +# ============================================================================ +# Example Path Structures +# ============================================================================ +# +# Multi-Bucket Mode (default): +# bucket: kb_12345/file.pdf +# bucket: kb_67890/file.pdf +# bucket: folder_abc/file.txt +# +# Single Bucket Mode (MINIO_BUCKET set): +# bucket: ragflow-bucket/kb_12345/file.pdf +# bucket: ragflow-bucket/kb_67890/file.pdf +# bucket: ragflow-bucket/folder_abc/file.txt +# +# Single Bucket with Prefix (both set): +# bucket: ragflow-bucket/ragflow/kb_12345/file.pdf +# bucket: ragflow-bucket/ragflow/kb_67890/file.pdf +# bucket: ragflow-bucket/ragflow/folder_abc/file.txt + +# ============================================================================ +# IAM Policy for Single Bucket Mode +# ============================================================================ +# +# When using single bucket mode, you only need permissions for one bucket: +# +# { +# "Version": "2012-10-17", +# "Statement": [ +# { +# "Effect": "Allow", +# "Action": ["s3:*"], +# "Resource": [ +# 
"arn:aws:s3:::ragflow-bucket", +# "arn:aws:s3:::ragflow-bucket/*" +# ] +# } +# ] +# } + +# ============================================================================ +# Testing the Configuration +# ============================================================================ +# +# After setting these variables, you can test with MinIO Client (mc): +# +# # Configure mc alias +# mc alias set ragflow https://minio.example.com:443 \ +# your-access-key \ +# your-secret-password-here +# +# # List bucket contents +# mc ls ragflow/ragflow-bucket/ +# +# # If prefix is set, check the prefix +# mc ls ragflow/ragflow-bucket/ragflow/ +# +# # Test write permission +# echo "test" | mc pipe ragflow/ragflow-bucket/ragflow/_test.txt +# +# # Clean up test file +# mc rm ragflow/ragflow-bucket/ragflow/_test.txt diff --git a/docker/service_conf.yaml.template b/docker/service_conf.yaml.template index fa85453ab..9bc6a08f6 100644 --- a/docker/service_conf.yaml.template +++ b/docker/service_conf.yaml.template @@ -1,22 +1,31 @@ ragflow: host: ${RAGFLOW_HOST:-0.0.0.0} http_port: 9380 + svr_http_port: 8080 admin: host: ${RAGFLOW_HOST:-0.0.0.0} http_port: 9381 -mysql: - name: '${MYSQL_DBNAME:-rag_flow}' - user: '${MYSQL_USER:-root}' - password: '${MYSQL_PASSWORD:-infini_rag_flow}' - host: '${MYSQL_HOST:-mysql}' - port: 3306 - max_connections: 900 - stale_timeout: 300 - max_allowed_packet: ${MYSQL_MAX_PACKET:-1073741824} +# mysql: +# name: '${MYSQL_DBNAME:-rag_flow}' +# user: '${MYSQL_USER:-root}' +# password: '${MYSQL_PASSWORD:-infini_rag_flow}' +# host: '${MYSQL_HOST:-mysql}' +# port: 3306 +# max_connections: 900 +# stale_timeout: 300 +# max_allowed_packet: ${MYSQL_MAX_PACKET:-1073741824} minio: user: '${MINIO_USER:-rag_flow}' password: '${MINIO_PASSWORD:-infini_rag_flow}' host: '${MINIO_HOST:-minio}:9000' + bucket: '${MINIO_BUCKET:-}' + prefix_path: '${MINIO_PREFIX_PATH:-}' +# s3: +# access_key: '${MINIO_USER}' +# secret_key: '${MINIO_PASSWORD}' +# endpoint_url: 'https://${MINIO_HOST}' +# 
bucket: 'ragflow-bucket' +# region: 'us-east-1' es: hosts: 'http://${ES_HOST:-es01}:9200' username: '${ES_USER:-elastic}' @@ -28,40 +37,19 @@ os: infinity: uri: '${INFINITY_HOST:-infinity}:23817' db_name: 'default_db' -oceanbase: - scheme: 'oceanbase' # set 'mysql' to create connection using mysql config - config: - db_name: '${OCEANBASE_DOC_DBNAME:-test}' - user: '${OCEANBASE_USER:-root@ragflow}' - password: '${OCEANBASE_PASSWORD:-infini_rag_flow}' - host: '${OCEANBASE_HOST:-oceanbase}' - port: ${OCEANBASE_PORT:-2881} redis: db: 1 password: '${REDIS_PASSWORD:-infini_rag_flow}' host: '${REDIS_HOST:-redis}:6379' -user_default_llm: - default_models: - embedding_model: - api_key: 'xxx' - base_url: 'http://${TEI_HOST}:80' -# postgres: -# name: '${POSTGRES_DBNAME:-rag_flow}' -# user: '${POSTGRES_USER:-rag_flow}' -# password: '${POSTGRES_PASSWORD:-infini_rag_flow}' -# host: '${POSTGRES_HOST:-postgres}' -# port: 5432 -# max_connections: 100 -# stale_timeout: 30 -# s3: -# access_key: 'access_key' -# secret_key: 'secret_key' -# region: 'region' -# endpoint_url: 'endpoint_url' -# bucket: 'bucket' -# prefix_path: 'prefix_path' -# signature_version: 'v4' -# addressing_style: 'path' + +postgres: + name: '${POSTGRES_DBNAME:-rag_flow}' + user: '${POSTGRES_USER:-rag_flow}' + password: '${POSTGRES_PASSWORD:-infini_rag_flow}' + host: '${POSTGRES_HOST:-postgres}' + port: 5432 + max_connections: 100 + stale_timeout: 30 # oss: # access_key: '${ACCESS_KEY}' # secret_key: '${SECRET_KEY}' @@ -80,50 +68,27 @@ user_default_llm: # secret: 'secret' # tenant_id: 'tenant_id' # container_name: 'container_name' -# The OSS object storage uses the MySQL configuration above by default. If you need to switch to another object storage service, please uncomment and configure the following parameters. -# opendal: -# scheme: 'mysql' # Storage type, such as s3, oss, azure, etc. 
-# config: -# oss_table: 'opendal_storage' # user_default_llm: -# factory: 'BAAI' -# api_key: 'backup' -# base_url: 'backup_base_url' +# factory: 'Tongyi-Qianwen' +# api_key: 'sk-xxxxxxxxxxxxx' +# base_url: '' # default_models: -# chat_model: -# name: 'qwen2.5-7b-instruct' -# factory: 'xxxx' -# api_key: 'xxxx' -# base_url: 'https://api.xx.com' -# embedding_model: -# name: 'bge-m3' -# rerank_model: 'bge-reranker-v2' -# asr_model: -# model: 'whisper-large-v3' # alias of name +# chat_model: 'qwen-plus' +# embedding_model: 'BAAI/bge-large-zh-v1.5@BAAI' +# rerank_model: '' +# asr_model: '' # image2text_model: '' # oauth: -# oauth2: -# display_name: "OAuth2" -# client_id: "your_client_id" -# client_secret: "your_client_secret" -# authorization_url: "https://your-oauth-provider.com/oauth/authorize" -# token_url: "https://your-oauth-provider.com/oauth/token" -# userinfo_url: "https://your-oauth-provider.com/oauth/userinfo" -# redirect_uri: "https://your-app.com/v1/user/oauth/callback/oauth2" -# oidc: -# display_name: "OIDC" -# client_id: "your_client_id" -# client_secret: "your_client_secret" -# issuer: "https://your-oauth-provider.com/oidc" -# scope: "openid email profile" -# redirect_uri: "https://your-app.com/v1/user/oauth/callback/oidc" # github: -# type: "github" -# icon: "github" -# display_name: "Github" -# client_id: "your_client_id" -# client_secret: "your_client_secret" -# redirect_uri: "https://your-app.com/v1/user/oauth/callback/github" +# client_id: xxxxxxxxxxxxxxxxxxxxxxxxx +# secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxx +# url: https://github.com/login/oauth/access_token +# feishu: +# app_id: cli_xxxxxxxxxxxxxxxxxxx +# app_secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxx +# app_access_token_url: https://open.feishu.cn/open-apis/auth/v3/app_access_token/internal +# user_access_token_url: https://open.feishu.cn/open-apis/authen/v1/oidc/access_token +# grant_type: 'authorization_code' # authentication: # client: # switch: false @@ -135,18 +100,3 @@ user_default_llm: # switch: 
false # component: false # dataset: false -# smtp: -# mail_server: "" -# mail_port: 465 -# mail_use_ssl: true -# mail_use_tls: false -# mail_username: "" -# mail_password: "" -# mail_default_sender: -# - "RAGFlow" # display name -# - "" # sender email address -# mail_frontend_url: "https://your-frontend.example.com" -# tcadp_config: -# secret_id: '${TENCENT_SECRET_ID}' -# secret_key: '${TENCENT_SECRET_KEY}' -# region: '${TENCENT_REGION}' diff --git a/docs/single-bucket-mode.md b/docs/single-bucket-mode.md new file mode 100644 index 000000000..08179cfcf --- /dev/null +++ b/docs/single-bucket-mode.md @@ -0,0 +1,162 @@ +# Single Bucket Mode for MinIO/S3 + +## Overview + +By default, RAGFlow creates one bucket per Knowledge Base (dataset) and one bucket per user folder. This can be problematic when: + +- Your cloud provider charges per bucket +- Your IAM policy restricts bucket creation +- You want all data organized in a single bucket with directory structure + +The **Single Bucket Mode** allows you to configure RAGFlow to use a single bucket with a directory structure instead of multiple buckets. 
+ +## How It Works + +### Default Mode (Multiple Buckets) + +``` +bucket: kb_12345/ + └── document_1.pdf +bucket: kb_67890/ + └── document_2.pdf +bucket: folder_abc/ + └── file_3.txt +``` + +### Single Bucket Mode (with prefix_path) + +``` +bucket: ragflow-bucket/ + └── ragflow/ + ├── kb_12345/ + │ └── document_1.pdf + ├── kb_67890/ + │ └── document_2.pdf + └── folder_abc/ + └── file_3.txt +``` + +## Configuration + +### MinIO Configuration + +Edit your `service_conf.yaml` or set environment variables: + +```yaml +minio: + user: "your-access-key" + password: "your-secret-key" + host: "minio.example.com:443" + bucket: "ragflow-bucket" # Default bucket name + prefix_path: "ragflow" # Optional prefix path +``` + +Or using environment variables: + +```bash +export MINIO_USER=your-access-key +export MINIO_PASSWORD=your-secret-key +export MINIO_HOST=minio.example.com:443 +export MINIO_BUCKET=ragflow-bucket +export MINIO_PREFIX_PATH=ragflow +``` + +### S3 Configuration (already supported) + +```yaml +s3: + access_key: "your-access-key" + secret_key: "your-secret-key" + endpoint_url: "https://s3.amazonaws.com" + bucket: "my-ragflow-bucket" + prefix_path: "production" + region: "us-east-1" +``` + +## IAM Policy Example + +When using single bucket mode, you only need permissions for one bucket: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["s3:*"], + "Resource": [ + "arn:aws:s3:::ragflow-bucket", + "arn:aws:s3:::ragflow-bucket/*" + ] + } + ] +} +``` + +## Migration from Multi-Bucket to Single Bucket + +If you're migrating from multi-bucket mode to single-bucket mode: + +1. **Set environment variables** for the new configuration +2. **Restart RAGFlow** services +3. 
**Migrate existing data** (optional): + +```bash +# Example using mc (MinIO Client) +mc alias set old-minio http://old-minio:9000 ACCESS_KEY SECRET_KEY +mc alias set new-minio https://new-minio:443 ACCESS_KEY SECRET_KEY + +# List all knowledge base buckets +mc ls old-minio/ | grep kb_ | while read -r line; do + bucket=$(echo $line | awk '{print $5}') + # Copy each bucket to the new structure + mc cp --recursive old-minio/$bucket/ new-minio/ragflow-bucket/ragflow/$bucket/ +done +``` + +## Toggle Between Modes + +### Enable Single Bucket Mode + +```yaml +minio: + bucket: "my-single-bucket" + prefix_path: "ragflow" +``` + +### Disable (Use Multi-Bucket Mode) + +```yaml +minio: + # Leave bucket and prefix_path empty or commented out + # bucket: '' + # prefix_path: '' +``` + +## Troubleshooting + +### Issue: Access Denied errors + +**Solution**: Ensure your IAM policy grants access to the bucket specified in the configuration. + +### Issue: Files not found after switching modes + +**Solution**: The path structure changes between modes. You'll need to migrate existing data. + +### Issue: Connection fails with HTTPS + +**Solution**: The MinIO client is always created with `secure=True` (TLS), regardless of port; ensure the endpoint configured in `host` (`MINIO_HOST`) actually serves HTTPS. 
+ +## Storage Backends Supported + +- ✅ **MinIO** - Full support with single bucket mode +- ✅ **AWS S3** - Full support with single bucket mode +- ✅ **Alibaba OSS** - Full support with single bucket mode +- ✅ **Azure Blob** - Uses container-based structure (different paradigm) +- ⚠️ **OpenDAL** - Depends on underlying storage backend + +## Performance Considerations + +- **Single bucket mode** may have slightly better performance for bucket listing operations +- **Multi-bucket mode** provides better isolation and organization for large deployments +- Choose based on your specific requirements and infrastructure constraints diff --git a/rag/utils/minio_conn.py b/rag/utils/minio_conn.py index e0913e98b..343755109 100644 --- a/rag/utils/minio_conn.py +++ b/rag/utils/minio_conn.py @@ -28,8 +28,49 @@ from common import settings class RAGFlowMinio: def __init__(self): self.conn = None + self.bucket = settings.MINIO.get('bucket', None) + self.prefix_path = settings.MINIO.get('prefix_path', None) self.__open__() + @staticmethod + def use_default_bucket(method): + def wrapper(self, bucket, *args, **kwargs): + # If there is a default bucket, use the default bucket + # but preserve the original bucket identifier so it can be + # used as a path prefix inside the physical/default bucket. + original_bucket = bucket + actual_bucket = self.bucket if self.bucket else bucket + if self.bucket: + # pass original identifier forward for use by other decorators + kwargs['_orig_bucket'] = original_bucket + return method(self, actual_bucket, *args, **kwargs) + return wrapper + + @staticmethod + def use_prefix_path(method): + def wrapper(self, bucket, fnm, *args, **kwargs): + # If a default MINIO bucket is configured, the use_default_bucket + # decorator will have replaced the `bucket` arg with the physical + # bucket name and forwarded the original identifier as `_orig_bucket`. + # Prefer that original identifier when constructing the key path so + # objects are stored under {prefix_path}/{orig_bucket}/{fnm} 
+ orig_bucket = kwargs.pop('_orig_bucket', None) + + if self.prefix_path: + # If a prefix_path is configured, include it and then the identifier + if orig_bucket: + fnm = f"{self.prefix_path}/{orig_bucket}/{fnm}" + else: + fnm = f"{self.prefix_path}/{fnm}" + else: + # No prefix_path configured. If orig_bucket exists and the + # physical bucket equals configured default, use orig_bucket as a path. + if orig_bucket and bucket == self.bucket: + fnm = f"{orig_bucket}/{fnm}" + + return method(self, bucket, fnm, *args, **kwargs) + return wrapper + def __open__(self): try: if self.conn: @@ -41,7 +82,7 @@ class RAGFlowMinio: self.conn = Minio(settings.MINIO["host"], access_key=settings.MINIO["user"], secret_key=settings.MINIO["password"], - secure=False + secure=True ) except Exception: logging.exception( @@ -52,20 +93,28 @@ class RAGFlowMinio: self.conn = None def health(self): - bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1" - if not self.conn.bucket_exists(bucket): - self.conn.make_bucket(bucket) + bucket = self.bucket if self.bucket else "ragflow-bucket" + fnm = "_health_check" + if self.prefix_path: + fnm = f"{self.prefix_path}/{fnm}" + binary = b"_t@@@1" + # Don't try to create bucket - it should already exist + # if not self.conn.bucket_exists(bucket): + # self.conn.make_bucket(bucket) r = self.conn.put_object(bucket, fnm, BytesIO(binary), len(binary) ) return r - def put(self, bucket, fnm, binary, tenant_id=None): + @use_default_bucket + @use_prefix_path + def put(self, bucket, fnm, binary): for _ in range(3): try: - if not self.conn.bucket_exists(bucket): - self.conn.make_bucket(bucket) + # Note: bucket must already exist - we don't have permission to create buckets + # if not self.conn.bucket_exists(bucket): + # self.conn.make_bucket(bucket) r = self.conn.put_object(bucket, fnm, BytesIO(binary), @@ -77,13 +126,17 @@ class RAGFlowMinio: self.__open__() time.sleep(1) - def rm(self, bucket, fnm, tenant_id=None): + @use_default_bucket + @use_prefix_path 
+ def rm(self, bucket, fnm): try: self.conn.remove_object(bucket, fnm) except Exception: logging.exception(f"Fail to remove {bucket}/{fnm}:") - def get(self, bucket, filename, tenant_id=None): + @use_default_bucket + @use_prefix_path + def get(self, bucket, filename): for _ in range(1): try: r = self.conn.get_object(bucket, filename) @@ -92,9 +145,11 @@ class RAGFlowMinio: logging.exception(f"Fail to get {bucket}/{filename}") self.__open__() time.sleep(1) - return None + return - def obj_exist(self, bucket, filename, tenant_id=None): + @use_default_bucket + @use_prefix_path + def obj_exist(self, bucket, filename): try: if not self.conn.bucket_exists(bucket): return False @@ -109,6 +164,7 @@ class RAGFlowMinio: logging.exception(f"obj_exist {bucket}/{filename} got exception") return False + @use_default_bucket def bucket_exists(self, bucket): try: if not self.conn.bucket_exists(bucket): @@ -122,7 +178,9 @@ class RAGFlowMinio: logging.exception(f"bucket_exist {bucket} got exception") return False - def get_presigned_url(self, bucket, fnm, expires, tenant_id=None): + @use_default_bucket + @use_prefix_path + def get_presigned_url(self, bucket, fnm, expires): for _ in range(10): try: return self.conn.get_presigned_url("GET", bucket, fnm, expires) @@ -130,8 +188,9 @@ class RAGFlowMinio: logging.exception(f"Fail to get_presigned {bucket}/{fnm}:") self.__open__() time.sleep(1) - return None + return + @use_default_bucket def remove_bucket(self, bucket): try: if self.conn.bucket_exists(bucket): @@ -141,37 +200,3 @@ class RAGFlowMinio: self.conn.remove_bucket(bucket) except Exception: logging.exception(f"Fail to remove bucket {bucket}") - - def copy(self, src_bucket, src_path, dest_bucket, dest_path): - try: - if not self.conn.bucket_exists(dest_bucket): - self.conn.make_bucket(dest_bucket) - - try: - self.conn.stat_object(src_bucket, src_path) - except Exception as e: - logging.exception(f"Source object not found: {src_bucket}/{src_path}, {e}") - return False - - 
self.conn.copy_object( - dest_bucket, - dest_path, - CopySource(src_bucket, src_path), - ) - return True - - except Exception: - logging.exception(f"Fail to copy {src_bucket}/{src_path} -> {dest_bucket}/{dest_path}") - return False - - def move(self, src_bucket, src_path, dest_bucket, dest_path): - try: - if self.copy(src_bucket, src_path, dest_bucket, dest_path): - self.rm(src_bucket, src_path) - return True - else: - logging.error(f"Copy failed, move aborted: {src_bucket}/{src_path}") - return False - except Exception: - logging.exception(f"Fail to move {src_bucket}/{src_path} -> {dest_bucket}/{dest_path}") - return False