Add S3 storage client and API routes for document management:
- Implement s3_routes.py with file upload, download, delete endpoints
- Enhance s3_client.py with improved error handling and operations
- Add S3 browser UI component with file viewing and management
- Implement FileViewer and PDFViewer components for storage preview
- Add Resizable and Sheet UI components for layout control

Update backend infrastructure:
- Add bulk operations and parameterized queries to postgres_impl.py
- Enhance document routes with improved type hints
- Update API server registration for new S3 routes
- Refine upload routes and utility functions

Modernize web UI:
- Integrate S3 browser into main application layout
- Update localization files for storage UI strings
- Add storage settings to application configuration
- Sync package dependencies and lock files

Remove obsolete reproduction script:
- Delete reproduce_citation.py (replaced by test suite)

Update configuration:
- Enhance pyrightconfig.json for stricter type checking
"""
|
|
Async S3 client wrapper for RustFS/MinIO/AWS S3 compatible object storage.
|
|
|
|
This module provides staging and archive functionality for documents:
|
|
- Upload to staging: s3://bucket/staging/{workspace}/{doc_id}
|
|
- Move to archive: s3://bucket/archive/{workspace}/{doc_id}
|
|
- Generate presigned URLs for citations
|
|
"""
|
|
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import threading
|
|
from contextlib import asynccontextmanager
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, ClassVar
|
|
|
|
import pipmaster as pm
|
|
|
|
if not pm.is_installed("aioboto3"):
|
|
pm.install("aioboto3")
|
|
|
|
import aioboto3
|
|
from botocore.config import Config as BotoConfig
|
|
from botocore.exceptions import ClientError
|
|
from tenacity import (
|
|
before_sleep_log,
|
|
retry,
|
|
retry_if_exception_type,
|
|
stop_after_attempt,
|
|
wait_exponential,
|
|
)
|
|
|
|
from lightrag.utils import logger
|
|
|
|
# Constants with environment variable support
|
|
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "")
|
|
S3_ACCESS_KEY_ID = os.getenv("S3_ACCESS_KEY_ID", "")
|
|
S3_SECRET_ACCESS_KEY = os.getenv("S3_SECRET_ACCESS_KEY", "")
|
|
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "lightrag")
|
|
S3_REGION = os.getenv("S3_REGION", "us-east-1")
|
|
S3_RETRY_ATTEMPTS = int(os.getenv("S3_RETRY_ATTEMPTS", "3"))
|
|
S3_CONNECT_TIMEOUT = int(os.getenv("S3_CONNECT_TIMEOUT", "10"))
|
|
S3_READ_TIMEOUT = int(os.getenv("S3_READ_TIMEOUT", "30"))
|
|
S3_PRESIGNED_URL_EXPIRY = int(os.getenv("S3_PRESIGNED_URL_EXPIRY", "3600")) # 1 hour
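# Illustrative environment settings for the variables above, assuming a local
# MinIO/RustFS instance on port 9000 (endpoint, credentials, and bucket are
# placeholder values for a local deployment, not requirements of this module):
#
#   S3_ENDPOINT_URL=http://localhost:9000
#   S3_ACCESS_KEY_ID=minioadmin
#   S3_SECRET_ACCESS_KEY=minioadmin
#   S3_BUCKET_NAME=lightrag
#   S3_REGION=us-east-1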


# Retry decorator for S3 operations
s3_retry = retry(
    stop=stop_after_attempt(S3_RETRY_ATTEMPTS),
    wait=wait_exponential(multiplier=1, min=1, max=8),
    retry=retry_if_exception_type(ClientError),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
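# The decorator retries only botocore ClientError, sleeping with exponential
# backoff between 1 and 8 seconds and logging a warning before each retry; it
# gives up after S3_RETRY_ATTEMPTS attempts, and any other exception propagates
# immediately.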


@dataclass
class S3Config:
    """Configuration for S3 client."""

    endpoint_url: str = field(default_factory=lambda: S3_ENDPOINT_URL)
    access_key_id: str = field(default_factory=lambda: S3_ACCESS_KEY_ID)
    secret_access_key: str = field(default_factory=lambda: S3_SECRET_ACCESS_KEY)
    bucket_name: str = field(default_factory=lambda: S3_BUCKET_NAME)
    region: str = field(default_factory=lambda: S3_REGION)
    connect_timeout: int = field(default_factory=lambda: S3_CONNECT_TIMEOUT)
    read_timeout: int = field(default_factory=lambda: S3_READ_TIMEOUT)
    presigned_url_expiry: int = field(default_factory=lambda: S3_PRESIGNED_URL_EXPIRY)

    def __post_init__(self):
        if not self.access_key_id or not self.secret_access_key:
            raise ValueError(
                "S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY must be set"
            )
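# Construction sketch (illustrative only; the endpoint and credentials below are
# assumed values for a local object store, not anything this module requires):
#
#   config = S3Config()                 # read everything from the environment
#   config = S3Config(                  # ...or override selected fields
#       endpoint_url="http://localhost:9000",
#       access_key_id="minioadmin",
#       secret_access_key="minioadmin",
#       bucket_name="lightrag-dev",
#   )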


class S3ClientManager:
    """Shared S3 session manager to avoid creating multiple sessions."""

    _sessions: ClassVar[dict[str, aioboto3.Session]] = {}
    _session_refs: ClassVar[dict[str, int]] = {}
    _lock: ClassVar[threading.Lock] = threading.Lock()

    @classmethod
    def get_session(cls, config: S3Config) -> aioboto3.Session:
        """Get or create a session for the given S3 config."""
        # Use endpoint + access_key as session key
        session_key = f"{config.endpoint_url}:{config.access_key_id}"

        with cls._lock:
            if session_key not in cls._sessions:
                cls._sessions[session_key] = aioboto3.Session(
                    aws_access_key_id=config.access_key_id,
                    aws_secret_access_key=config.secret_access_key,
                    region_name=config.region,
                )
                cls._session_refs[session_key] = 0
                logger.info(f"Created shared S3 session for {config.endpoint_url}")

            cls._session_refs[session_key] += 1
            logger.debug(
                f"S3 session {session_key} reference count: {cls._session_refs[session_key]}"
            )

            return cls._sessions[session_key]

    @classmethod
    def release_session(cls, config: S3Config):
        """Release a reference to the session."""
        session_key = f"{config.endpoint_url}:{config.access_key_id}"

        with cls._lock:
            if session_key in cls._session_refs:
                cls._session_refs[session_key] -= 1
                logger.debug(
                    f"S3 session {session_key} reference count: {cls._session_refs[session_key]}"
                )
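# Design note: sessions are keyed by endpoint URL plus access key and handed out
# with reference counting, so multiple S3Client instances that target the same
# store share one aioboto3.Session; the class-level threading.Lock keeps that
# bookkeeping safe across threads.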


@dataclass
class S3Client:
    """
    Async S3 client for document staging and archival.

    Usage:
        config = S3Config()
        client = S3Client(config)
        await client.initialize()

        # Upload to staging
        s3_key = await client.upload_to_staging(workspace, doc_id, content, filename)

        # Move to archive after processing
        archive_key = await client.move_to_archive(s3_key)

        # Get presigned URL for citations
        url = await client.get_presigned_url(archive_key)

        await client.finalize()
    """

    config: S3Config
    _session: aioboto3.Session | None = field(default=None, init=False, repr=False)
    _initialized: bool = field(default=False, init=False, repr=False)

    async def initialize(self):
        """Initialize the S3 client."""
        if self._initialized:
            return

        self._session = S3ClientManager.get_session(self.config)

        # Ensure bucket exists
        await self._ensure_bucket_exists()

        self._initialized = True
        logger.info(f"S3 client initialized for bucket: {self.config.bucket_name}")

    async def finalize(self):
        """Release resources."""
        if self._initialized:
            S3ClientManager.release_session(self.config)
            self._initialized = False
            logger.info("S3 client finalized")

    @asynccontextmanager
    async def _get_client(self):
        """Get an S3 client from the session."""
        if self._session is None:
            raise RuntimeError("S3Client not initialized")

        boto_config = BotoConfig(
            connect_timeout=self.config.connect_timeout,
            read_timeout=self.config.read_timeout,
            retries={"max_attempts": S3_RETRY_ATTEMPTS},
        )

        async with self._session.client(  # type: ignore
            "s3",
            endpoint_url=self.config.endpoint_url if self.config.endpoint_url else None,
            config=boto_config,
        ) as client:
            yield client

    async def _ensure_bucket_exists(self):
        """Create bucket if it doesn't exist."""
        async with self._get_client() as client:
            try:
                await client.head_bucket(Bucket=self.config.bucket_name)
                logger.debug(f"Bucket {self.config.bucket_name} exists")
            except ClientError as e:
                error_code = e.response.get("Error", {}).get("Code", "")
                if error_code in ("404", "NoSuchBucket"):
                    logger.info(f"Creating bucket: {self.config.bucket_name}")
                    await client.create_bucket(Bucket=self.config.bucket_name)
                else:
                    raise

    def _make_staging_key(self, workspace: str, doc_id: str, filename: str) -> str:
        """Generate S3 key for staging area."""
        safe_filename = filename.replace("/", "_").replace("\\", "_")
        return f"staging/{workspace}/{doc_id}/{safe_filename}"

    def _make_archive_key(self, workspace: str, doc_id: str, filename: str) -> str:
        """Generate S3 key for archive area."""
        safe_filename = filename.replace("/", "_").replace("\\", "_")
        return f"archive/{workspace}/{doc_id}/{safe_filename}"

    def _staging_to_archive_key(self, staging_key: str) -> str:
        """Convert staging key to archive key."""
        if staging_key.startswith("staging/"):
            return "archive/" + staging_key[8:]
        return staging_key
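    # Example key layout produced by the helpers above (illustrative values;
    # "default" stands in for a real workspace):
    #   staging/default/doc-abc123/report.pdf -> archive/default/doc-abc123/report.pdf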

    @s3_retry
    async def upload_to_staging(
        self,
        workspace: str,
        doc_id: str,
        content: bytes | str,
        filename: str,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> str:
        """
        Upload document to staging area.

        Args:
            workspace: Workspace/tenant identifier
            doc_id: Document ID
            content: File content (bytes or string)
            filename: Original filename
            content_type: MIME type
            metadata: Optional metadata dict

        Returns:
            S3 key for the uploaded object
        """
        s3_key = self._make_staging_key(workspace, doc_id, filename)

        if isinstance(content, str):
            content = content.encode("utf-8")

        # Calculate content hash for deduplication
        content_hash = hashlib.sha256(content).hexdigest()

        upload_metadata = {
            "workspace": workspace,
            "doc_id": doc_id,
            "original_filename": filename,
            "content_hash": content_hash,
            **(metadata or {}),
        }

        async with self._get_client() as client:
            await client.put_object(
                Bucket=self.config.bucket_name,
                Key=s3_key,
                Body=content,
                ContentType=content_type,
                Metadata=upload_metadata,
            )

        logger.info(f"Uploaded to staging: {s3_key} ({len(content)} bytes)")
        return s3_key

    @s3_retry
    async def get_object(self, s3_key: str) -> tuple[bytes, dict[str, Any]]:
        """
        Get object content and metadata.

        Returns:
            Tuple of (content_bytes, metadata_dict)
        """
        async with self._get_client() as client:
            response = await client.get_object(
                Bucket=self.config.bucket_name,
                Key=s3_key,
            )
            content = await response["Body"].read()
            metadata = response.get("Metadata", {})

        logger.debug(f"Retrieved object: {s3_key} ({len(content)} bytes)")
        return content, metadata

    @s3_retry
    async def move_to_archive(self, staging_key: str) -> str:
        """
        Move object from staging to archive.

        Args:
            staging_key: Current S3 key in staging/

        Returns:
            New S3 key in archive/
        """
        archive_key = self._staging_to_archive_key(staging_key)

        async with self._get_client() as client:
            # Copy to archive
            await client.copy_object(
                Bucket=self.config.bucket_name,
                CopySource={"Bucket": self.config.bucket_name, "Key": staging_key},
                Key=archive_key,
            )

            # Delete from staging
            await client.delete_object(
                Bucket=self.config.bucket_name,
                Key=staging_key,
            )

        logger.info(f"Moved to archive: {staging_key} -> {archive_key}")
        return archive_key
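    # Note on move_to_archive: the copy-then-delete pair is not atomic, so a failed
    # delete after a successful copy briefly leaves the object under both keys. A
    # retry re-runs both calls, which is safe because copy_object overwrites the
    # same archive key.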

    @s3_retry
    async def delete_object(self, s3_key: str):
        """Delete an object."""
        async with self._get_client() as client:
            await client.delete_object(
                Bucket=self.config.bucket_name,
                Key=s3_key,
            )
        logger.info(f"Deleted object: {s3_key}")

    @s3_retry
    async def list_staging(self, workspace: str) -> list[dict[str, Any]]:
        """
        List all objects in staging for a workspace.

        Returns:
            List of dicts with key, size, last_modified
        """
        prefix = f"staging/{workspace}/"
        objects = []

        async with self._get_client() as client:
            paginator = client.get_paginator("list_objects_v2")
            async for page in paginator.paginate(
                Bucket=self.config.bucket_name, Prefix=prefix
            ):
                for obj in page.get("Contents", []):
                    objects.append(
                        {
                            "key": obj["Key"],
                            "size": obj["Size"],
                            "last_modified": obj["LastModified"].isoformat(),
                        }
                    )

        return objects

    async def get_presigned_url(
        self, s3_key: str, expiry: int | None = None
    ) -> str:
        """
        Generate a presigned URL for direct access.

        Args:
            s3_key: S3 object key
            expiry: URL expiry in seconds (default from config)

        Returns:
            Presigned URL string
        """
        expiry = expiry or self.config.presigned_url_expiry

        async with self._get_client() as client:
            url = await client.generate_presigned_url(
                "get_object",
                Params={"Bucket": self.config.bucket_name, "Key": s3_key},
                ExpiresIn=expiry,
            )

        return url

    async def object_exists(self, s3_key: str) -> bool:
        """Check if an object exists."""
        async with self._get_client() as client:
            try:
                await client.head_object(
                    Bucket=self.config.bucket_name,
                    Key=s3_key,
                )
                return True
            except ClientError as e:
                if e.response.get("Error", {}).get("Code") == "404":
                    return False
                raise

    def get_s3_url(self, s3_key: str) -> str:
        """Get the S3 URL for an object (not presigned, for reference)."""
        return f"s3://{self.config.bucket_name}/{s3_key}"

    @s3_retry
    async def list_objects(
        self, prefix: str = "", delimiter: str = "/"
    ) -> dict[str, Any]:
        """
        List objects and common prefixes (virtual folders) under a prefix.

        Uses delimiter to group objects into virtual folders. This enables
        folder-style navigation in the bucket browser.

        Args:
            prefix: S3 prefix to list under (e.g., "staging/default/")
            delimiter: Delimiter for grouping (default "/" for folder navigation)

        Returns:
            Dict with:
            - bucket: Bucket name
            - prefix: The prefix that was listed
            - folders: List of common prefixes (virtual folders)
            - objects: List of dicts with key, size, last_modified, content_type
        """
        folders: list[str] = []
        objects: list[dict[str, Any]] = []

        async with self._get_client() as client:
            paginator = client.get_paginator("list_objects_v2")
            async for page in paginator.paginate(
                Bucket=self.config.bucket_name,
                Prefix=prefix,
                Delimiter=delimiter,
            ):
                # Get common prefixes (virtual folders)
                for cp in page.get("CommonPrefixes", []):
                    folders.append(cp["Prefix"])

                # Get objects at this level
                for obj in page.get("Contents", []):
                    # Skip the prefix itself if it's a "folder marker"
                    if obj["Key"] == prefix:
                        continue
                    objects.append(
                        {
                            "key": obj["Key"],
                            "size": obj["Size"],
                            "last_modified": obj["LastModified"].isoformat(),
                            "content_type": None,  # Would need HEAD request for each
                        }
                    )

        return {
            "bucket": self.config.bucket_name,
            "prefix": prefix,
            "folders": folders,
            "objects": objects,
        }

    @s3_retry
    async def upload_object(
        self,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
        metadata: dict[str, str] | None = None,
    ) -> str:
        """
        Upload object to an arbitrary key path.

        Unlike upload_to_staging which enforces a path structure, this method
        allows uploading to any path in the bucket.

        Args:
            key: Full S3 key path (e.g., "staging/workspace/doc_id/file.txt")
            data: File content as bytes
            content_type: MIME type (default: application/octet-stream)
            metadata: Optional metadata dict

        Returns:
            The S3 key where the object was uploaded
        """
        async with self._get_client() as client:
            await client.put_object(
                Bucket=self.config.bucket_name,
                Key=key,
                Body=data,
                ContentType=content_type,
                Metadata=metadata or {},
            )

        logger.info(f"Uploaded object: {key} ({len(data)} bytes)")
        return key
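

# Minimal end-to-end sketch of the staging -> archive -> presigned-URL flow shown
# in the S3Client docstring. It runs only when this module is executed directly
# and assumes the S3 credentials are set in the environment (S3Config raises
# ValueError otherwise); the workspace, doc_id, and payload below are illustrative
# values, not anything LightRAG itself uses.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        client = S3Client(S3Config())
        await client.initialize()
        try:
            staging_key = await client.upload_to_staging(
                workspace="demo-workspace",
                doc_id="doc-0001",
                content=b"hello from the S3 client demo",
                filename="hello.txt",
                content_type="text/plain",
            )
            archive_key = await client.move_to_archive(staging_key)
            url = await client.get_presigned_url(archive_key)
            print(f"Archived to {client.get_s3_url(archive_key)}")
            print(f"Presigned URL (expires in {S3_PRESIGNED_URL_EXPIRY}s): {url}")
        finally:
            await client.finalize()

    asyncio.run(_demo())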