Add S3 storage client and API routes for document management:
- Implement s3_routes.py with file upload, download, delete endpoints
- Enhance s3_client.py with improved error handling and operations
- Add S3 browser UI component with file viewing and management
- Implement FileViewer and PDFViewer components for storage preview
- Add Resizable and Sheet UI components for layout control

Update backend infrastructure:
- Add bulk operations and parameterized queries to postgres_impl.py
- Enhance document routes with improved type hints
- Update API server registration for new S3 routes
- Refine upload routes and utility functions

Modernize web UI:
- Integrate S3 browser into main application layout
- Update localization files for storage UI strings
- Add storage settings to application configuration
- Sync package dependencies and lock files

Remove obsolete reproduction script:
- Delete reproduce_citation.py (replaced by test suite)

Update configuration:
- Enhance pyrightconfig.json for stricter type checking
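The intended flow for the new routes is stage, inspect, then process. Below is a minimal client sketch of that flow; only the endpoint paths and payload fields come from upload_routes.py, while the base URL, the /upload mount point, the absence of an API key header, and the use of the requests library are assumptions about a particular deployment. Binary formats such as PDF are accepted by the upload endpoint but are currently rejected by /process until document parsing lands (see the handler below), so the sketch uses a plain-text file.

# Client-side sketch of the staging flow (endpoint paths and fields taken from
# upload_routes.py below). Assumptions: the LightRAG API server is reachable at
# http://localhost:9621, the router keeps its default /upload prefix, and no
# API key header is required.
import requests

BASE_URL = 'http://localhost:9621/upload'

# 1. Stage a text document in S3/RustFS.
with open('notes.txt', 'rb') as fh:
    resp = requests.post(
        BASE_URL,
        files={'file': ('notes.txt', fh, 'text/plain')},
        data={'workspace': 'default'},
    )
resp.raise_for_status()
upload = resp.json()
print(upload['doc_id'], upload['s3_key'])  # doc_..., staging/default/doc_.../notes.txt

# 2. Confirm it shows up in the staging listing for the workspace.
staged = requests.get(f'{BASE_URL}/staged', params={'workspace': 'default'}).json()
print(staged['count'], 'document(s) staged')

# 3. Run it through the RAG pipeline and move it to the archive prefix afterwards.
result = requests.post(
    f'{BASE_URL}/process',
    json={'s3_key': upload['s3_key'], 'archive_after_processing': True},
).json()
print(result['status'], result['track_id'], result['archive_key'])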
443 lines
16 KiB
Python
"""
|
|
Upload routes for S3/RustFS document staging.
|
|
|
|
This module provides endpoints for:
|
|
- Uploading documents to S3 staging
|
|
- Listing staged documents
|
|
- Getting presigned URLs
|
|
"""
|
|
|
|
import mimetypes
|
|
from typing import Annotated, Any, ClassVar, cast
|
|
|
|
from fastapi import (
|
|
APIRouter,
|
|
Depends,
|
|
File,
|
|
Form,
|
|
HTTPException,
|
|
UploadFile,
|
|
)
|
|
from pydantic import BaseModel, Field
|
|
|
|
from lightrag import LightRAG
|
|
from lightrag.api.utils_api import get_combined_auth_dependency
|
|
from lightrag.kg.postgres_impl import PGDocStatusStorage, PGKVStorage
|
|
from lightrag.storage.s3_client import S3Client
|
|
from lightrag.utils import compute_mdhash_id, logger
|
|
|
|
|
|
class UploadResponse(BaseModel):
    """Response model for document upload."""

    status: str = Field(description='Upload status')
    doc_id: str = Field(description='Document ID')
    s3_key: str = Field(description='S3 object key')
    s3_url: str = Field(description='S3 URL (s3://bucket/key)')
    message: str | None = Field(default=None, description='Additional message')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'uploaded',
                'doc_id': 'doc_abc123',
                's3_key': 'staging/default/doc_abc123/report.pdf',
                's3_url': 's3://lightrag/staging/default/doc_abc123/report.pdf',
                'message': 'Document staged for processing',
            }
        }


class StagedDocument(BaseModel):
    """Model for a staged document."""

    key: str = Field(description='S3 object key')
    size: int = Field(description='File size in bytes')
    last_modified: str = Field(description='Last modified timestamp')


class ListStagedResponse(BaseModel):
    """Response model for listing staged documents."""

    workspace: str = Field(description='Workspace name')
    documents: list[StagedDocument] = Field(description='List of staged documents')
    count: int = Field(description='Number of documents')


class PresignedUrlResponse(BaseModel):
    """Response model for presigned URL."""

    s3_key: str = Field(description='S3 object key')
    presigned_url: str = Field(description='Presigned URL for direct access')
    expiry_seconds: int = Field(description='URL expiry time in seconds')


class ProcessS3Request(BaseModel):
    """Request model for processing a document from S3 staging."""

    s3_key: str = Field(description='S3 key of the staged document')
    doc_id: str | None = Field(
        default=None,
        description='Document ID (extracted from s3_key if not provided)',
    )
    archive_after_processing: bool = Field(
        default=True,
        description='Move document to archive after successful processing',
    )

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                's3_key': 'staging/default/doc_abc123/report.pdf',
                'doc_id': 'doc_abc123',
                'archive_after_processing': True,
            }
        }


class ProcessS3Response(BaseModel):
    """Response model for S3 document processing."""

    status: str = Field(description='Processing status')
    track_id: str = Field(description='Track ID for monitoring processing progress')
    doc_id: str = Field(description='Document ID')
    s3_key: str = Field(description='Original S3 key')
    archive_key: str | None = Field(default=None, description='Archive S3 key (if archived)')
    message: str | None = Field(default=None, description='Additional message')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'processing_complete',
                'track_id': 'insert_20250101_120000_abc123',
                'doc_id': 'doc_abc123',
                's3_key': 'staging/default/doc_abc123/report.pdf',
                'archive_key': 'archive/default/doc_abc123/report.pdf',
                'message': 'Document processed and stored in RAG pipeline',
            }
        }


def create_upload_routes(
    rag: LightRAG,
    s3_client: S3Client,
    api_key: str | None = None,
) -> APIRouter:
    """
    Create upload routes for S3 document staging.

    Args:
        rag: LightRAG instance
        s3_client: Initialized S3Client instance
        api_key: Optional API key for authentication

    Returns:
        FastAPI router with upload endpoints
    """
    router = APIRouter(
        prefix='/upload',
        tags=['upload'],
    )

    optional_api_key = get_combined_auth_dependency(api_key)

    @router.post(
        '',
        response_model=UploadResponse,
        summary='Upload document to S3 staging',
        description="""
        Upload a document to S3/RustFS staging area.

        The document will be staged at: s3://bucket/staging/{workspace}/{doc_id}/{filename}

        After upload, the document can be processed by calling the standard document
        processing endpoints, which will:
        1. Fetch the document from S3 staging
        2. Process it through the RAG pipeline
        3. Move it to S3 archive
        4. Store processed data in PostgreSQL
        """,
    )
    async def upload_document(
        file: Annotated[UploadFile, File(description='Document file to upload')],
        workspace: Annotated[str, Form(description='Workspace name')] = 'default',
        doc_id: Annotated[str | None, Form(description='Optional document ID (auto-generated if not provided)')] = None,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> UploadResponse:
        """Upload a document to S3 staging."""
        try:
            # Read file content
            content = await file.read()

            if not content:
                raise HTTPException(status_code=400, detail='Empty file')

            # Generate doc_id if not provided
            if not doc_id:
                doc_id = compute_mdhash_id(content, prefix='doc_')

            # Determine content type
            final_content_type = file.content_type
            if not final_content_type:
                guessed_type, encoding = mimetypes.guess_type(file.filename or '')
                final_content_type = guessed_type or 'application/octet-stream'

            # Upload to S3 staging
            s3_key = await s3_client.upload_to_staging(
                workspace=workspace,
                doc_id=doc_id,
                content=content,
                filename=file.filename or f'{doc_id}.bin',
                content_type=final_content_type,
                metadata={
                    'original_size': str(len(content)),
                    'content_type': final_content_type,
                },
            )

            s3_url = s3_client.get_s3_url(s3_key)

            logger.info(f'Document uploaded to staging: {s3_key}')

            return UploadResponse(
                status='uploaded',
                doc_id=doc_id,
                s3_key=s3_key,
                s3_url=s3_url,
                message='Document staged for processing',
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Upload failed: {e}')
            raise HTTPException(status_code=500, detail=f'Upload failed: {e}') from e

    @router.get(
        '/staged',
        response_model=ListStagedResponse,
        summary='List staged documents',
        description='List all documents in the staging area for a workspace.',
    )
    async def list_staged(
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ListStagedResponse:
        """List documents in staging."""
        try:
            objects = await s3_client.list_staging(workspace)

            documents = [
                StagedDocument(
                    key=obj['key'],
                    size=obj['size'],
                    last_modified=obj['last_modified'],
                )
                for obj in objects
            ]

            return ListStagedResponse(
                workspace=workspace,
                documents=documents,
                count=len(documents),
            )

        except Exception as e:
            logger.error(f'Failed to list staged documents: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to list staged documents: {e}') from e

    @router.get(
        '/presigned-url',
        response_model=PresignedUrlResponse,
        summary='Get presigned URL',
        description='Generate a presigned URL for direct access to a document in S3.',
    )
    async def get_presigned_url(
        s3_key: str,
        expiry: int = 3600,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> PresignedUrlResponse:
        """Get presigned URL for a document."""
        try:
            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(status_code=404, detail='Object not found')

            url = await s3_client.get_presigned_url(s3_key, expiry=expiry)

            return PresignedUrlResponse(
                s3_key=s3_key,
                presigned_url=url,
                expiry_seconds=expiry,
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to generate presigned URL: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to generate presigned URL: {e}') from e

    @router.delete(
        '/staged/{doc_id}',
        summary='Delete staged document',
        description='Delete a document from the staging area.',
    )
    async def delete_staged(
        doc_id: str,
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> dict[str, str]:
        """Delete a staged document."""
        try:
            # List objects with this doc_id prefix
            prefix = f'staging/{workspace}/{doc_id}/'
            objects = await s3_client.list_staging(workspace)

            # Filter to this doc_id
            to_delete = [obj['key'] for obj in objects if obj['key'].startswith(prefix)]

            if not to_delete:
                raise HTTPException(status_code=404, detail='Document not found in staging')

            # Delete each object
            for key in to_delete:
                await s3_client.delete_object(key)

            return {
                'status': 'deleted',
                'doc_id': doc_id,
                'deleted_count': str(len(to_delete)),
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to delete staged document: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to delete staged document: {e}') from e

    @router.post(
        '/process',
        response_model=ProcessS3Response,
        summary='Process document from S3 staging',
        description="""
        Fetch a document from S3 staging and process it through the RAG pipeline.

        This endpoint:
        1. Fetches the document content from S3 staging
        2. Processes it through the RAG pipeline (chunking, entity extraction, embedding)
        3. Stores processed data in PostgreSQL with s3_key reference
        4. Optionally moves the document from staging to archive

        The s3_key should be the full key returned from the upload endpoint,
        e.g., "staging/default/doc_abc123/report.pdf"
        """,
    )
    async def process_from_s3(
        request: ProcessS3Request,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ProcessS3Response:
        """Process a staged document through the RAG pipeline."""
        try:
            s3_key = request.s3_key

            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(
                    status_code=404,
                    detail=f'Document not found in S3: {s3_key}',
                )

            # Fetch content from S3
            content_bytes, metadata = await s3_client.get_object(s3_key)

            # Extract doc_id from s3_key if not provided
            # s3_key format: staging/{workspace}/{doc_id}/{filename}
            doc_id = request.doc_id
            if not doc_id:
                parts = s3_key.split('/')
                doc_id = parts[2] if len(parts) >= 3 else compute_mdhash_id(content_bytes, prefix='doc_')

            # Determine content type and decode appropriately
            content_type = metadata.get('content_type', 'application/octet-stream')
            s3_url = s3_client.get_s3_url(s3_key)

            # For text-based content, decode to string
            if content_type.startswith('text/') or content_type in (
                'application/json',
                'application/xml',
                'application/javascript',
            ):
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    text_content = content_bytes.decode('latin-1')
            else:
                # For binary content (PDF, Word, etc.), we need document parsing
                # For now, attempt UTF-8 decode or fail gracefully
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    raise HTTPException(
                        status_code=400,
                        detail=f'Cannot process binary content type: {content_type}. '
                        'Document parsing for PDF/Word not yet implemented.',
                    ) from None

            if not text_content.strip():
                raise HTTPException(
                    status_code=400,
                    detail='Document content is empty after decoding',
                )

            # Process through RAG pipeline
            # Use s3_url as file_path for citation reference
            logger.info(f'Processing S3 document: {s3_key} (doc_id: {doc_id})')

            track_id = await rag.ainsert(
                input=text_content,
                ids=doc_id,
                file_paths=s3_url,
            )

            # Move to archive if requested
            archive_key = None
            if request.archive_after_processing:
                try:
                    archive_key = await s3_client.move_to_archive(s3_key)
                    logger.info(f'Moved to archive: {s3_key} -> {archive_key}')

                    # Update database chunks with archive s3_key
                    archive_url = s3_client.get_s3_url(archive_key)
                    updated_count = await cast(PGKVStorage, rag.text_chunks).update_s3_key_by_doc_id(
                        full_doc_id=doc_id,
                        s3_key=archive_key,
                        archive_url=archive_url,
                    )
                    logger.info(f'Updated {updated_count} chunks with archive s3_key: {archive_key}')

                    # Update doc_status with archive s3_key
                    await cast(PGDocStatusStorage, rag.doc_status).update_s3_key(doc_id, archive_key)
                    logger.info(f'Updated doc_status with archive s3_key: {archive_key}')
                except Exception as e:
                    logger.warning(f'Failed to archive document: {e}')
                    # Don't fail the request, processing succeeded

            return ProcessS3Response(
                status='processing_complete',
                track_id=track_id,
                doc_id=doc_id,
                s3_key=s3_key,
                archive_key=archive_key,
                message='Document processed and stored in RAG pipeline',
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to process S3 document: {e}')
            raise HTTPException(
                status_code=500,
                detail=f'Failed to process S3 document: {e}',
            ) from e

    return router
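The factory above only builds the router; registration happens in the API server, per the commit message ("Update API server registration for new S3 routes"). A rough wiring sketch follows. The create_upload_routes(...) call matches the signature defined in this file, but the import path, the FastAPI app setup, and the pre-built rag and s3_client instances are placeholders for illustration, not the actual server code.

# Hypothetical registration sketch -- not the real LightRAG server startup code.
from fastapi import FastAPI

from lightrag import LightRAG
from lightrag.storage.s3_client import S3Client
from lightrag.api.routers.upload_routes import create_upload_routes  # assumed module path

app = FastAPI(title='LightRAG API')

rag: LightRAG = ...          # constructed during server startup (omitted here)
s3_client: S3Client = ...    # initialized S3/RustFS client (constructor args omitted)

# Mount the staging endpoints; passing api_key=None leaves authentication to the
# combined auth dependency's default behaviour.
app.include_router(create_upload_routes(rag=rag, s3_client=s3_client, api_key=None))

# Resulting endpoints under the router's /upload prefix:
#   POST   /upload                  - stage a document
#   GET    /upload/staged           - list staged documents
#   GET    /upload/presigned-url    - presigned URL for direct access
#   DELETE /upload/staged/{doc_id}  - delete a staged document
#   POST   /upload/process          - fetch from staging and run the RAG pipeline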