feat: Add Google Cloud Storage (GCS) support as object storage backend (#11716)

- Implement RAGFlowGCS class with full storage interface compliance
- Single bucket architecture with prefix-based logical separation
- Application Default Credentials (ADC) authentication support
- Add GCS to Storage enum and StorageFactory
- Add google-cloud-storage dependency to pyproject.toml
- Support for all standard operations: put, get, rm, copy, move, presigned URLs

Configuration:
- STORAGE_IMPL=GCS
- RAGFLOW__GCS__BUCKET=your-bucket-name
- GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json (optional)

IAM Requirements:
- Storage Object User (roles/storage.objectUser)
- Service Account Token Creator (roles/iam.serviceAccountTokenCreator)

Resolves #11716
This commit is contained in:
SmartDever02 2025-12-03 21:08:13 -03:00
parent 648342b62f
commit f896b8b6c8
4 changed files with 439 additions and 2 deletions

View file

@ -148,6 +148,7 @@ class Storage(Enum):
AWS_S3 = 4 AWS_S3 = 4
OSS = 5 OSS = 5
OPENDAL = 6 OPENDAL = 6
GCS = 7
# environment # environment
# ENV_STRONG_TEST_COUNT = "STRONG_TEST_COUNT" # ENV_STRONG_TEST_COUNT = "STRONG_TEST_COUNT"

View file

@ -35,6 +35,7 @@ from rag.utils.minio_conn import RAGFlowMinio
from rag.utils.opendal_conn import OpenDALStorage from rag.utils.opendal_conn import OpenDALStorage
from rag.utils.s3_conn import RAGFlowS3 from rag.utils.s3_conn import RAGFlowS3
from rag.utils.oss_conn import RAGFlowOSS from rag.utils.oss_conn import RAGFlowOSS
from rag.utils.gcs_conn import RAGFlowGCS
from rag.nlp import search from rag.nlp import search
@ -109,6 +110,7 @@ MINIO = {}
OB = {} OB = {}
OSS = {} OSS = {}
OS = {} OS = {}
GCS = {}
DOC_MAXIMUM_SIZE: int = 128 * 1024 * 1024 DOC_MAXIMUM_SIZE: int = 128 * 1024 * 1024
DOC_BULK_SIZE: int = 4 DOC_BULK_SIZE: int = 4
@ -151,7 +153,8 @@ class StorageFactory:
Storage.AZURE_SAS: RAGFlowAzureSasBlob, Storage.AZURE_SAS: RAGFlowAzureSasBlob,
Storage.AWS_S3: RAGFlowS3, Storage.AWS_S3: RAGFlowS3,
Storage.OSS: RAGFlowOSS, Storage.OSS: RAGFlowOSS,
Storage.OPENDAL: OpenDALStorage Storage.OPENDAL: OpenDALStorage,
Storage.GCS: RAGFlowGCS
} }
@classmethod @classmethod
@ -250,7 +253,7 @@ def init_settings():
else: else:
raise Exception(f"Not supported doc engine: {DOC_ENGINE}") raise Exception(f"Not supported doc engine: {DOC_ENGINE}")
global AZURE, S3, MINIO, OSS global AZURE, S3, MINIO, OSS, GCS
if STORAGE_IMPL_TYPE in ['AZURE_SPN', 'AZURE_SAS']: if STORAGE_IMPL_TYPE in ['AZURE_SPN', 'AZURE_SAS']:
AZURE = get_base_config("azure", {}) AZURE = get_base_config("azure", {})
elif STORAGE_IMPL_TYPE == 'AWS_S3': elif STORAGE_IMPL_TYPE == 'AWS_S3':
@ -259,6 +262,8 @@ def init_settings():
MINIO = decrypt_database_config(name="minio") MINIO = decrypt_database_config(name="minio")
elif STORAGE_IMPL_TYPE == 'OSS': elif STORAGE_IMPL_TYPE == 'OSS':
OSS = get_base_config("oss", {}) OSS = get_base_config("oss", {})
elif STORAGE_IMPL_TYPE == 'GCS':
GCS = get_base_config("gcs", {})
global STORAGE_IMPL global STORAGE_IMPL
STORAGE_IMPL = StorageFactory.create(Storage[STORAGE_IMPL_TYPE]) STORAGE_IMPL = StorageFactory.create(Storage[STORAGE_IMPL_TYPE])

View file

@ -44,6 +44,7 @@ dependencies = [
"flask-session==0.8.0", "flask-session==0.8.0",
"google-search-results==2.4.2", "google-search-results==2.4.2",
"google-auth-oauthlib>=1.2.0,<2.0.0", "google-auth-oauthlib>=1.2.0,<2.0.0",
"google-cloud-storage>=2.10.0,<3.0.0",
"groq==0.9.0", "groq==0.9.0",
"hanziconv==0.3.2", "hanziconv==0.3.2",
"html-text==0.6.2", "html-text==0.6.2",

430
rag/utils/gcs_conn.py Normal file
View file

@ -0,0 +1,430 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Google Cloud Storage (GCS) Backend for RAGFlow
This module provides GCS support for RAGFlow's object storage layer.
It uses Application Default Credentials (ADC) for authentication and
implements a single-bucket architecture with logical prefix-based separation.
Configuration:
STORAGE_IMPL=GCS
RAGFLOW__GCS__BUCKET=your-gcs-bucket-name
GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json (optional if using Workload Identity)
IAM Requirements:
- Storage Object User (roles/storage.objectUser)
- Service Account Token Creator (roles/iam.serviceAccountTokenCreator) for presigned URLs
"""
import logging
import time
from io import BytesIO
from datetime import timedelta
from google.cloud import storage
from google.cloud.exceptions import NotFound, GoogleCloudError
from common.decorator import singleton
from common import settings
@singleton
class RAGFlowGCS:
    """
    Google Cloud Storage backend implementation for RAGFlow.

    This class implements the RAGFlow storage interface using GCS.
    It uses a single GCS bucket with prefix-based logical separation
    for different "buckets" (tenants, workspaces, etc.).

    Architecture:
        - Single physical GCS bucket (configured in settings)
        - Logical "buckets" mapped to prefixes: {logical_bucket}/{filename}
        - Example: put("rag-data", "file.txt") -> gs://bucket/rag-data/file.txt

    Authentication:
        Uses Application Default Credentials (ADC):
        - GOOGLE_APPLICATION_CREDENTIALS environment variable
        - Workload Identity (GKE/Cloud Run)
        - gcloud CLI credentials
    """

    def __init__(self):
        """Initialize GCS client and configuration."""
        self.client = None
        self.bucket_name = None
        self.bucket = None
        self.__open__()

    def __open__(self):
        """
        Initialize or reinitialize the GCS client connection.

        This method:
            1. Closes any existing connection
            2. Loads configuration from settings
            3. Creates a new GCS client using ADC
            4. Gets a reference to the configured bucket

        Raises:
            ValueError: If no bucket name is configured in settings.
            Exception: If the GCS client cannot be created.
        """
        try:
            if self.client:
                self.__close__()
        except Exception:
            # Best-effort close of a stale client; never block a reconnect.
            pass

        try:
            # Load GCS configuration from settings
            gcs_config = settings.GCS
            self.bucket_name = gcs_config.get('bucket')
            if not self.bucket_name:
                raise ValueError("GCS bucket name not configured in settings")

            # Initialize GCS client with Application Default Credentials.
            # This automatically uses:
            #   - GOOGLE_APPLICATION_CREDENTIALS env var
            #   - Workload Identity on GKE/Cloud Run
            #   - gcloud CLI credentials
            self.client = storage.Client()

            # Get bucket reference (no API call is made here)
            self.bucket = self.client.bucket(self.bucket_name)
            logging.info(f"GCS client initialized successfully for bucket: {self.bucket_name}")
        except Exception as e:
            logging.exception(f"Failed to connect to GCS bucket {self.bucket_name}: {e}")
            raise

    def __close__(self):
        """Close the GCS client connection and drop cached references."""
        if self.client:
            self.client.close()
        self.client = None
        self.bucket = None

    def _get_blob_name(self, logical_bucket, filename):
        """
        Convert logical bucket + filename to GCS blob name (prefix).

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            filename (str): File name within the logical bucket

        Returns:
            str: Full blob path in format: {logical_bucket}/{filename}
        """
        # FIX: the original returned the literal text "(unknown)" instead of
        # interpolating the filename, corrupting every object path.
        return f"{logical_bucket}/{filename}"

    def health(self):
        """
        Health check: Verify GCS connection by writing a test object.

        Returns:
            bool: True if health check passes

        Raises:
            Exception: If health check fails
        """
        logical_bucket = "txtxtxtxt1"
        fnm = "txtxtxtxt1"
        binary = b"_t@@@1"
        try:
            blob_name = self._get_blob_name(logical_bucket, fnm)
            blob = self.bucket.blob(blob_name)
            blob.upload_from_string(binary)
            logging.info("GCS health check passed")
            return True
        except Exception as e:
            logging.exception(f"GCS health check failed: {e}")
            raise

    def put(self, logical_bucket, fnm, binary, tenant_id=None):
        """
        Upload an object to GCS, retrying up to 3 times with reconnects.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            fnm (str): Filename
            binary (bytes | BytesIO | file-like | buffer): File content
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            True if successful

        Raises:
            Exception: If upload fails after retries
        """
        blob_name = self._get_blob_name(logical_bucket, fnm)
        for attempt in range(3):
            try:
                blob = self.bucket.blob(blob_name)
                if isinstance(binary, bytes):
                    blob.upload_from_string(binary)
                elif isinstance(binary, BytesIO):
                    binary.seek(0)
                    blob.upload_from_file(binary)
                elif hasattr(binary, "read"):
                    # FIX: generic file-like objects must be streamed directly;
                    # the original wrapped them in BytesIO(), which raises
                    # TypeError for anything that is not a bytes-like buffer.
                    blob.upload_from_file(binary)
                else:
                    # Bytes-like buffers (bytearray, memoryview, ...)
                    blob.upload_from_string(bytes(binary))
                logging.debug(f"Successfully uploaded {blob_name}")
                return True
            except Exception as e:
                logging.warning(f"Failed to put {blob_name} (attempt {attempt + 1}/3): {e}")
                if attempt < 2:  # Don't reconnect on last attempt
                    self.__open__()
                    time.sleep(1)
                else:
                    logging.exception(f"Failed to put {logical_bucket}/{fnm} after all retries")
                    raise
        return False

    def rm(self, logical_bucket, fnm, tenant_id=None):
        """
        Delete an object from GCS. Missing objects are logged, not raised.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            fnm (str): Filename
            tenant_id (str, optional): Tenant ID for logging
        """
        blob_name = self._get_blob_name(logical_bucket, fnm)
        try:
            blob = self.bucket.blob(blob_name)
            blob.delete()
            logging.debug(f"Successfully deleted {blob_name}")
        except NotFound:
            logging.warning(f"Object not found for deletion: {blob_name}")
        except Exception as e:
            logging.exception(f"Failed to remove {logical_bucket}/{fnm}: {e}")

    def get(self, logical_bucket, filename, tenant_id=None):
        """
        Download an object from GCS, retrying transient failures.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            filename (str): Filename
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            bytes: File content, or None if not found or all retries fail
        """
        blob_name = self._get_blob_name(logical_bucket, filename)
        # FIX: the original looped range(1) with an unreachable `attempt < 0`
        # reconnect branch, so transient errors were never retried. Use the
        # same 3-attempt reconnect pattern as put().
        for attempt in range(3):
            try:
                blob = self.bucket.blob(blob_name)
                content = blob.download_as_bytes()
                logging.debug(f"Successfully downloaded {blob_name}")
                return content
            except NotFound:
                # A missing object is an expected outcome, not a failure.
                logging.warning(f"Object not found: {blob_name}")
                return None
            except Exception as e:
                logging.exception(f"Failed to get {logical_bucket}/{filename}: {e}")
                if attempt < 2:
                    self.__open__()
                    time.sleep(1)
        return None

    def obj_exist(self, logical_bucket, filename, tenant_id=None):
        """
        Check if an object exists in GCS.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            filename (str): Filename
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            bool: True if object exists, False otherwise (including on error)
        """
        blob_name = self._get_blob_name(logical_bucket, filename)
        try:
            blob = self.bucket.blob(blob_name)
            exists = blob.exists()
            logging.debug(f"Object exists check for {blob_name}: {exists}")
            return exists
        except Exception as e:
            logging.exception(f"Failed to check if object exists {logical_bucket}/{filename}: {e}")
            return False

    def bucket_exists(self, logical_bucket):
        """
        Check if the physical GCS bucket exists.

        Note: In this implementation, "logical buckets" are just prefixes,
        so this always checks the physical bucket.

        Args:
            logical_bucket (str): Logical bucket name (ignored, checks physical bucket)

        Returns:
            bool: True if physical bucket exists, False otherwise (including on error)
        """
        try:
            exists = self.bucket.exists()
            logging.debug(f"Bucket exists check for {self.bucket_name}: {exists}")
            return exists
        except Exception as e:
            logging.exception(f"Failed to check if bucket exists {self.bucket_name}: {e}")
            return False

    def get_presigned_url(self, logical_bucket, fnm, expires, tenant_id=None):
        """
        Generate a presigned URL for temporary access to an object.

        Args:
            logical_bucket (str): Logical bucket name (used as prefix)
            fnm (str): Filename
            expires (int): Expiration time in seconds
            tenant_id (str, optional): Tenant ID for logging

        Returns:
            str: Presigned URL, or None if generation fails

        Note:
            Requires Service Account Token Creator role for generating signed URLs
            when running on GCP infrastructure.
        """
        blob_name = self._get_blob_name(logical_bucket, fnm)
        for attempt in range(10):
            try:
                blob = self.bucket.blob(blob_name)
                # Generate a V4 signed URL with the specified expiration
                url = blob.generate_signed_url(
                    version="v4",
                    expiration=timedelta(seconds=expires),
                    method="GET"
                )
                logging.debug(f"Successfully generated presigned URL for {blob_name}")
                return url
            except Exception as e:
                logging.warning(f"Failed to generate presigned URL for {blob_name} (attempt {attempt + 1}/10): {e}")
                if attempt < 9:  # Don't reconnect on last attempt
                    self.__open__()
                    time.sleep(1)
                else:
                    logging.exception(f"Failed to generate presigned URL for {logical_bucket}/{fnm} after all retries")
        return None

    def remove_bucket(self, logical_bucket):
        """
        Remove all objects with a given prefix (logical bucket).

        Args:
            logical_bucket (str): Logical bucket name (prefix to delete)

        Warning:
            This deletes ALL objects under the prefix. Use with caution!
        """
        try:
            # List all blobs under the logical-bucket prefix
            prefix = f"{logical_bucket}/"
            blobs = list(self.bucket.list_blobs(prefix=prefix))
            logging.info(f"Removing {len(blobs)} objects with prefix {prefix}")

            # Delete each blob individually; one failure does not stop the rest
            for blob in blobs:
                try:
                    blob.delete()
                except Exception as e:
                    logging.error(f"Failed to delete {blob.name}: {e}")
            logging.info(f"Successfully removed logical bucket {logical_bucket}")
        except Exception as e:
            logging.exception(f"Failed to remove logical bucket {logical_bucket}: {e}")

    def copy(self, src_bucket, src_path, dest_bucket, dest_path):
        """
        Copy an object from one location to another within the physical bucket.

        Args:
            src_bucket (str): Source logical bucket
            src_path (str): Source filename
            dest_bucket (str): Destination logical bucket
            dest_path (str): Destination filename

        Returns:
            bool: True if copy succeeded, False otherwise
        """
        try:
            src_blob_name = self._get_blob_name(src_bucket, src_path)
            dest_blob_name = self._get_blob_name(dest_bucket, dest_path)

            # Check if source exists before attempting the server-side copy
            src_blob = self.bucket.blob(src_blob_name)
            if not src_blob.exists():
                logging.error(f"Source object not found: {src_blob_name}")
                return False

            # Server-side copy; copy_blob takes the destination name directly
            # (the original also built an unused destination Blob object).
            self.bucket.copy_blob(src_blob, self.bucket, dest_blob_name)
            logging.info(f"Successfully copied {src_blob_name} to {dest_blob_name}")
            return True
        except Exception as e:
            logging.exception(f"Failed to copy {src_bucket}/{src_path} -> {dest_bucket}/{dest_path}: {e}")
            return False

    def move(self, src_bucket, src_path, dest_bucket, dest_path):
        """
        Move an object from one location to another (copy + delete).

        Args:
            src_bucket (str): Source logical bucket
            src_path (str): Source filename
            dest_bucket (str): Destination logical bucket
            dest_path (str): Destination filename

        Returns:
            bool: True if move succeeded, False otherwise
        """
        try:
            # Copy first; only delete the source if the copy succeeded
            if self.copy(src_bucket, src_path, dest_bucket, dest_path):
                self.rm(src_bucket, src_path)
                logging.info(f"Successfully moved {src_bucket}/{src_path} to {dest_bucket}/{dest_path}")
                return True
            else:
                logging.error(f"Copy failed, move aborted: {src_bucket}/{src_path}")
                return False
        except Exception as e:
            logging.exception(f"Failed to move {src_bucket}/{src_path} -> {dest_bucket}/{dest_path}: {e}")
            return False