feat: Add Google Cloud Storage (GCS) support as object storage backend (#11716)

- Implement RAGFlowGCS class with full storage interface compliance
- Single bucket architecture with prefix-based logical separation
- Application Default Credentials (ADC) authentication support
- Add GCS to Storage enum and StorageFactory
- Add google-cloud-storage dependency to pyproject.toml
- Support for all standard operations: put, get, rm, copy, move, presigned URLs

Configuration:
- STORAGE_IMPL=GCS
- RAGFLOW__GCS__BUCKET=your-bucket-name
- GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json (optional)

IAM Requirements:
- Storage Object User (roles/storage.objectUser)
- Service Account Token Creator (roles/iam.serviceAccountTokenCreator)

Resolves #11716
This commit is contained in:
SmartDever02 2025-12-03 21:08:13 -03:00
parent 648342b62f
commit f896b8b6c8
4 changed files with 439 additions and 2 deletions

View file

@ -148,6 +148,7 @@ class Storage(Enum):
AWS_S3 = 4 AWS_S3 = 4
OSS = 5 OSS = 5
OPENDAL = 6 OPENDAL = 6
GCS = 7
# environment # environment
# ENV_STRONG_TEST_COUNT = "STRONG_TEST_COUNT" # ENV_STRONG_TEST_COUNT = "STRONG_TEST_COUNT"

View file

@ -35,6 +35,7 @@ from rag.utils.minio_conn import RAGFlowMinio
from rag.utils.opendal_conn import OpenDALStorage from rag.utils.opendal_conn import OpenDALStorage
from rag.utils.s3_conn import RAGFlowS3 from rag.utils.s3_conn import RAGFlowS3
from rag.utils.oss_conn import RAGFlowOSS from rag.utils.oss_conn import RAGFlowOSS
from rag.utils.gcs_conn import RAGFlowGCS
from rag.nlp import search from rag.nlp import search
@ -109,6 +110,7 @@ MINIO = {}
OB = {} OB = {}
OSS = {} OSS = {}
OS = {} OS = {}
GCS = {}
DOC_MAXIMUM_SIZE: int = 128 * 1024 * 1024 DOC_MAXIMUM_SIZE: int = 128 * 1024 * 1024
DOC_BULK_SIZE: int = 4 DOC_BULK_SIZE: int = 4
@ -151,7 +153,8 @@ class StorageFactory:
Storage.AZURE_SAS: RAGFlowAzureSasBlob, Storage.AZURE_SAS: RAGFlowAzureSasBlob,
Storage.AWS_S3: RAGFlowS3, Storage.AWS_S3: RAGFlowS3,
Storage.OSS: RAGFlowOSS, Storage.OSS: RAGFlowOSS,
Storage.OPENDAL: OpenDALStorage Storage.OPENDAL: OpenDALStorage,
Storage.GCS: RAGFlowGCS
} }
@classmethod @classmethod
@ -250,7 +253,7 @@ def init_settings():
else: else:
raise Exception(f"Not supported doc engine: {DOC_ENGINE}") raise Exception(f"Not supported doc engine: {DOC_ENGINE}")
global AZURE, S3, MINIO, OSS global AZURE, S3, MINIO, OSS, GCS
if STORAGE_IMPL_TYPE in ['AZURE_SPN', 'AZURE_SAS']: if STORAGE_IMPL_TYPE in ['AZURE_SPN', 'AZURE_SAS']:
AZURE = get_base_config("azure", {}) AZURE = get_base_config("azure", {})
elif STORAGE_IMPL_TYPE == 'AWS_S3': elif STORAGE_IMPL_TYPE == 'AWS_S3':
@ -259,6 +262,8 @@ def init_settings():
MINIO = decrypt_database_config(name="minio") MINIO = decrypt_database_config(name="minio")
elif STORAGE_IMPL_TYPE == 'OSS': elif STORAGE_IMPL_TYPE == 'OSS':
OSS = get_base_config("oss", {}) OSS = get_base_config("oss", {})
elif STORAGE_IMPL_TYPE == 'GCS':
GCS = get_base_config("gcs", {})
global STORAGE_IMPL global STORAGE_IMPL
STORAGE_IMPL = StorageFactory.create(Storage[STORAGE_IMPL_TYPE]) STORAGE_IMPL = StorageFactory.create(Storage[STORAGE_IMPL_TYPE])

View file

@ -44,6 +44,7 @@ dependencies = [
"flask-session==0.8.0", "flask-session==0.8.0",
"google-search-results==2.4.2", "google-search-results==2.4.2",
"google-auth-oauthlib>=1.2.0,<2.0.0", "google-auth-oauthlib>=1.2.0,<2.0.0",
"google-cloud-storage>=2.10.0,<3.0.0",
"groq==0.9.0", "groq==0.9.0",
"hanziconv==0.3.2", "hanziconv==0.3.2",
"html-text==0.6.2", "html-text==0.6.2",

430
rag/utils/gcs_conn.py Normal file
View file

@ -0,0 +1,430 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Google Cloud Storage (GCS) Backend for RAGFlow
This module provides GCS support for RAGFlow's object storage layer.
It uses Application Default Credentials (ADC) for authentication and
implements a single-bucket architecture with logical prefix-based separation.
Configuration:
STORAGE_IMPL=GCS
RAGFLOW__GCS__BUCKET=your-gcs-bucket-name
GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json (optional if using Workload Identity)
IAM Requirements:
- Storage Object User (roles/storage.objectUser)
- Service Account Token Creator (roles/iam.serviceAccountTokenCreator) for presigned URLs
"""
import logging
import time
from io import BytesIO
from datetime import timedelta
from google.cloud import storage
from google.cloud.exceptions import NotFound, GoogleCloudError
from common.decorator import singleton
from common import settings
@singleton
class RAGFlowGCS:
    """
    Google Cloud Storage backend implementation for RAGFlow.

    This class implements the RAGFlow storage interface using GCS.
    It uses a single GCS bucket with prefix-based logical separation
    for different "buckets" (tenants, workspaces, etc.).

    Architecture:
        - Single physical GCS bucket (configured in settings)
        - Logical "buckets" mapped to prefixes: {logical_bucket}/{filename}
        - Example: put("rag-data", "file.txt") -> gs://bucket/rag-data/file.txt

    Authentication:
        Uses Application Default Credentials (ADC):
        - GOOGLE_APPLICATION_CREDENTIALS environment variable
        - Workload Identity (GKE/Cloud Run)
        - gcloud CLI credentials
    """

    def __init__(self):
        """Initialize GCS client and configuration."""
        self.client = None
        self.bucket_name = None
        self.bucket = None
        self.__open__()

    def __open__(self):
        """
        Initialize or reinitialize the GCS client connection.

        This method:
            1. Closes any existing connection
            2. Loads configuration from settings
            3. Creates a new GCS client using ADC
            4. Gets a reference to the configured bucket

        Raises:
            ValueError: If no bucket name is configured in settings.
            Exception: If the GCS client cannot be created.
        """
        try:
            if self.client:
                self.__close__()
        except Exception:
            # Best-effort close of a stale client; never block a reconnect.
            pass

        try:
            # Load GCS configuration from settings
            gcs_config = settings.GCS
            self.bucket_name = gcs_config.get('bucket')
            if not self.bucket_name:
                raise ValueError("GCS bucket name not configured in settings")

            # Initialize GCS client with Application Default Credentials.
            # This automatically uses:
            #   - GOOGLE_APPLICATION_CREDENTIALS env var
            #   - Workload Identity on GKE/Cloud Run
            #   - gcloud CLI credentials
            self.client = storage.Client()

            # Get bucket reference (no API call is made here)
            self.bucket = self.client.bucket(self.bucket_name)
            logging.info(f"GCS client initialized successfully for bucket: {self.bucket_name}")
        except Exception as e:
            logging.exception(f"Failed to connect to GCS bucket {self.bucket_name}: {e}")
            raise

    def __close__(self):
        """Close the GCS client connection and drop cached references."""
        if self.client:
            self.client.close()
        self.client = None
        self.bucket = None

    def _get_blob_name(self, logical_bucket, filename):
        """
        Convert logical bucket + filename to GCS blob name (prefix).

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            filename (str): File name within the logical bucket

        Returns:
            str: Full blob path in format: {logical_bucket}/{filename}
        """
        # FIX: the original returned the literal text "(unknown)" instead of
        # interpolating the filename, corrupting every object path.
        return f"{logical_bucket}/{filename}"

    def health(self):
        """
        Health check: Verify GCS connection by writing a test object.

        Returns:
            bool: True if health check passes

        Raises:
            Exception: If health check fails
        """
        logical_bucket = "txtxtxtxt1"
        fnm = "txtxtxtxt1"
        binary = b"_t@@@1"
        try:
            blob_name = self._get_blob_name(logical_bucket, fnm)
            blob = self.bucket.blob(blob_name)
            blob.upload_from_string(binary)
            logging.info("GCS health check passed")
            return True
        except Exception as e:
            logging.exception(f"GCS health check failed: {e}")
            raise

    def put(self, logical_bucket, fnm, binary, tenant_id=None):
        """
        Upload an object to GCS, retrying up to 3 times with reconnects.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            fnm (str): Filename
            binary (bytes | BytesIO | file-like | buffer): File content
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            True if successful

        Raises:
            Exception: If upload fails after retries
        """
        blob_name = self._get_blob_name(logical_bucket, fnm)
        for attempt in range(3):
            try:
                blob = self.bucket.blob(blob_name)
                if isinstance(binary, bytes):
                    blob.upload_from_string(binary)
                elif isinstance(binary, BytesIO):
                    binary.seek(0)
                    blob.upload_from_file(binary)
                elif hasattr(binary, "read"):
                    # FIX: generic file-like objects must be streamed directly;
                    # the original wrapped them in BytesIO(), which raises
                    # TypeError for anything that is not a bytes-like buffer.
                    blob.upload_from_file(binary)
                else:
                    # Bytes-like buffers (bytearray, memoryview, ...)
                    blob.upload_from_string(bytes(binary))
                logging.debug(f"Successfully uploaded {blob_name}")
                return True
            except Exception as e:
                logging.warning(f"Failed to put {blob_name} (attempt {attempt + 1}/3): {e}")
                if attempt < 2:  # Don't reconnect on last attempt
                    self.__open__()
                    time.sleep(1)
                else:
                    logging.exception(f"Failed to put {logical_bucket}/{fnm} after all retries")
                    raise
        return False

    def rm(self, logical_bucket, fnm, tenant_id=None):
        """
        Delete an object from GCS. Missing objects are logged, not raised.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            fnm (str): Filename
            tenant_id (str, optional): Tenant ID for logging
        """
        blob_name = self._get_blob_name(logical_bucket, fnm)
        try:
            blob = self.bucket.blob(blob_name)
            blob.delete()
            logging.debug(f"Successfully deleted {blob_name}")
        except NotFound:
            logging.warning(f"Object not found for deletion: {blob_name}")
        except Exception as e:
            logging.exception(f"Failed to remove {logical_bucket}/{fnm}: {e}")

    def get(self, logical_bucket, filename, tenant_id=None):
        """
        Download an object from GCS, retrying transient failures.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            filename (str): Filename
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            bytes: File content, or None if not found or all retries fail
        """
        blob_name = self._get_blob_name(logical_bucket, filename)
        # FIX: the original looped range(1) with an unreachable `attempt < 0`
        # reconnect branch, so transient errors were never retried. Use the
        # same 3-attempt reconnect pattern as put().
        for attempt in range(3):
            try:
                blob = self.bucket.blob(blob_name)
                content = blob.download_as_bytes()
                logging.debug(f"Successfully downloaded {blob_name}")
                return content
            except NotFound:
                # A missing object is an expected outcome, not a failure.
                logging.warning(f"Object not found: {blob_name}")
                return None
            except Exception as e:
                logging.exception(f"Failed to get {logical_bucket}/{filename}: {e}")
                if attempt < 2:
                    self.__open__()
                    time.sleep(1)
        return None

    def obj_exist(self, logical_bucket, filename, tenant_id=None):
        """
        Check if an object exists in GCS.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            filename (str): Filename
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            bool: True if object exists, False otherwise (including on error)
        """
        blob_name = self._get_blob_name(logical_bucket, filename)
        try:
            blob = self.bucket.blob(blob_name)
            exists = blob.exists()
            logging.debug(f"Object exists check for {blob_name}: {exists}")
            return exists
        except Exception as e:
            logging.exception(f"Failed to check if object exists {logical_bucket}/{filename}: {e}")
            return False

    def bucket_exists(self, logical_bucket):
        """
        Check if the physical GCS bucket exists.

        Note: In this implementation, "logical buckets" are just prefixes,
        so this always checks the physical bucket.

        Args:
            logical_bucket (str): Logical bucket name (ignored, checks physical bucket)

        Returns:
            bool: True if physical bucket exists, False otherwise (including on error)
        """
        try:
            exists = self.bucket.exists()
            logging.debug(f"Bucket exists check for {self.bucket_name}: {exists}")
            return exists
        except Exception as e:
            logging.exception(f"Failed to check if bucket exists {self.bucket_name}: {e}")
            return False

    def get_presigned_url(self, logical_bucket, fnm, expires, tenant_id=None):
        """
        Generate a presigned URL for temporary access to an object.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            fnm (str): Filename
            expires (int): Expiration time in seconds
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            str: Presigned URL, or None if generation fails

        Note:
            Requires Service Account Token Creator role for generating signed URLs
            when running on GCP infrastructure.
        """
        blob_name = self._get_blob_name(logical_bucket, fnm)
        for attempt in range(10):
            try:
                blob = self.bucket.blob(blob_name)
                # Generate a V4 signed URL with the specified expiration
                url = blob.generate_signed_url(
                    version="v4",
                    expiration=timedelta(seconds=expires),
                    method="GET"
                )
                logging.debug(f"Successfully generated presigned URL for {blob_name}")
                return url
            except Exception as e:
                logging.warning(f"Failed to generate presigned URL for {blob_name} (attempt {attempt + 1}/10): {e}")
                if attempt < 9:  # Don't reconnect on last attempt
                    self.__open__()
                    time.sleep(1)
                else:
                    logging.exception(f"Failed to generate presigned URL for {logical_bucket}/{fnm} after all retries")
        return None

    def remove_bucket(self, logical_bucket):
        """
        Remove all objects with a given prefix (logical bucket).

        Args:
            logical_bucket (str): Logical bucket name (prefix to delete)

        Warning:
            This deletes ALL objects under the prefix. Use with caution!
        """
        try:
            # List all blobs under the logical-bucket prefix
            prefix = f"{logical_bucket}/"
            blobs = list(self.bucket.list_blobs(prefix=prefix))
            logging.info(f"Removing {len(blobs)} objects with prefix {prefix}")

            # Delete each blob individually; one failure does not stop the rest
            for blob in blobs:
                try:
                    blob.delete()
                except Exception as e:
                    logging.error(f"Failed to delete {blob.name}: {e}")
            logging.info(f"Successfully removed logical bucket {logical_bucket}")
        except Exception as e:
            logging.exception(f"Failed to remove logical bucket {logical_bucket}: {e}")

    def copy(self, src_bucket, src_path, dest_bucket, dest_path):
        """
        Copy an object from one location to another within the physical bucket.

        Args:
            src_bucket (str): Source logical bucket
            src_path (str): Source filename
            dest_bucket (str): Destination logical bucket
            dest_path (str): Destination filename

        Returns:
            bool: True if copy succeeded, False otherwise
        """
        try:
            src_blob_name = self._get_blob_name(src_bucket, src_path)
            dest_blob_name = self._get_blob_name(dest_bucket, dest_path)

            # Check if source exists before attempting the server-side copy
            src_blob = self.bucket.blob(src_blob_name)
            if not src_blob.exists():
                logging.error(f"Source object not found: {src_blob_name}")
                return False

            # Server-side copy; copy_blob takes the destination name directly
            # (the original also built an unused destination Blob object).
            self.bucket.copy_blob(src_blob, self.bucket, dest_blob_name)
            logging.info(f"Successfully copied {src_blob_name} to {dest_blob_name}")
            return True
        except Exception as e:
            logging.exception(f"Failed to copy {src_bucket}/{src_path} -> {dest_bucket}/{dest_path}: {e}")
            return False

    def move(self, src_bucket, src_path, dest_bucket, dest_path):
        """
        Move an object from one location to another (copy + delete).

        Args:
            src_bucket (str): Source logical bucket
            src_path (str): Source filename
            dest_bucket (str): Destination logical bucket
            dest_path (str): Destination filename

        Returns:
            bool: True if move succeeded, False otherwise
        """
        try:
            # Copy first; only delete the source if the copy succeeded
            if self.copy(src_bucket, src_path, dest_bucket, dest_path):
                self.rm(src_bucket, src_path)
                logging.info(f"Successfully moved {src_bucket}/{src_path} to {dest_bucket}/{dest_path}")
                return True
            else:
                logging.error(f"Copy failed, move aborted: {src_bucket}/{src_path}")
                return False
        except Exception as e:
            logging.exception(f"Failed to move {src_bucket}/{src_path} -> {dest_bucket}/{dest_path}: {e}")
            return False