LightRAG/lightrag/api/routers/upload_routes.py
clssck 59e89772de refactor: consolidate to PostgreSQL-only backend and modernize stack
Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs

Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes

Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples

Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios

Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
2025-12-12 16:28:49 +01:00


"""
Upload routes for S3/RustFS document staging.
This module provides endpoints for:
- Uploading documents to S3 staging
- Listing staged documents
- Getting presigned URLs
"""
import mimetypes
from typing import Annotated, Any, ClassVar
from fastapi import (
APIRouter,
Depends,
File,
Form,
HTTPException,
UploadFile,
)
from pydantic import BaseModel, Field
from lightrag import LightRAG
from lightrag.api.utils_api import get_combined_auth_dependency
from lightrag.kg.postgres_impl import PGDocStatusStorage, PGKVStorage
from lightrag.storage.s3_client import S3Client
from lightrag.utils import compute_mdhash_id, logger
class UploadResponse(BaseModel):
"""Response model for document upload."""
status: str = Field(description='Upload status')
doc_id: str = Field(description='Document ID')
s3_key: str = Field(description='S3 object key')
s3_url: str = Field(description='S3 URL (s3://bucket/key)')
message: str | None = Field(default=None, description='Additional message')
class Config:
json_schema_extra: ClassVar[dict[str, Any]] = {
'example': {
'status': 'uploaded',
'doc_id': 'doc_abc123',
's3_key': 'staging/default/doc_abc123/report.pdf',
's3_url': 's3://lightrag/staging/default/doc_abc123/report.pdf',
'message': 'Document staged for processing',
}
}
class StagedDocument(BaseModel):
"""Model for a staged document."""
key: str = Field(description='S3 object key')
size: int = Field(description='File size in bytes')
last_modified: str = Field(description='Last modified timestamp')
class ListStagedResponse(BaseModel):
"""Response model for listing staged documents."""
workspace: str = Field(description='Workspace name')
documents: list[StagedDocument] = Field(description='List of staged documents')
count: int = Field(description='Number of documents')
class PresignedUrlResponse(BaseModel):
"""Response model for presigned URL."""
s3_key: str = Field(description='S3 object key')
presigned_url: str = Field(description='Presigned URL for direct access')
expiry_seconds: int = Field(description='URL expiry time in seconds')
class ProcessS3Request(BaseModel):
"""Request model for processing a document from S3 staging."""
s3_key: str = Field(description='S3 key of the staged document')
doc_id: str | None = Field(
default=None,
description='Document ID (extracted from s3_key if not provided)',
)
archive_after_processing: bool = Field(
default=True,
description='Move document to archive after successful processing',
)
class Config:
json_schema_extra: ClassVar[dict[str, Any]] = {
'example': {
's3_key': 'staging/default/doc_abc123/report.pdf',
'doc_id': 'doc_abc123',
'archive_after_processing': True,
}
}
class ProcessS3Response(BaseModel):
"""Response model for S3 document processing."""
status: str = Field(description='Processing status')
track_id: str = Field(description='Track ID for monitoring processing progress')
doc_id: str = Field(description='Document ID')
s3_key: str = Field(description='Original S3 key')
archive_key: str | None = Field(default=None, description='Archive S3 key (if archived)')
message: str | None = Field(default=None, description='Additional message')
class Config:
json_schema_extra: ClassVar[dict[str, Any]] = {
'example': {
'status': 'processing_started',
'track_id': 'insert_20250101_120000_abc123',
'doc_id': 'doc_abc123',
's3_key': 'staging/default/doc_abc123/report.pdf',
'archive_key': 'archive/default/doc_abc123/report.pdf',
'message': 'Document processing started',
}
}
def create_upload_routes(
rag: LightRAG,
s3_client: S3Client,
api_key: str | None = None,
) -> APIRouter:
"""
Create upload routes for S3 document staging.
Args:
rag: LightRAG instance
s3_client: Initialized S3Client instance
api_key: Optional API key for authentication
Returns:
FastAPI router with upload endpoints
"""
router = APIRouter(
prefix='/upload',
tags=['upload'],
)
optional_api_key = get_combined_auth_dependency(api_key)
@router.post(
'',
response_model=UploadResponse,
summary='Upload document to S3 staging',
description="""
Upload a document to S3/RustFS staging area.
The document will be staged at: s3://bucket/staging/{workspace}/{doc_id}/{filename}
After upload, the document can be processed by calling the standard document
processing endpoints, which will:
1. Fetch the document from S3 staging
2. Process it through the RAG pipeline
3. Move it to S3 archive
4. Store processed data in PostgreSQL
""",
)
async def upload_document(
file: Annotated[UploadFile, File(description='Document file to upload')],
workspace: Annotated[str, Form(description='Workspace name')] = 'default',
doc_id: Annotated[str | None, Form(description='Optional document ID (auto-generated if not provided)')] = None,
_: Annotated[bool, Depends(optional_api_key)] = True,
) -> UploadResponse:
"""Upload a document to S3 staging."""
try:
# Read file content
content = await file.read()
if not content:
raise HTTPException(status_code=400, detail='Empty file')
# Generate doc_id if not provided
if not doc_id:
doc_id = compute_mdhash_id(content, prefix='doc_')
# Determine content type
final_content_type = file.content_type
if not final_content_type:
guessed_type, _encoding = mimetypes.guess_type(file.filename or '')
final_content_type = guessed_type or 'application/octet-stream'
# Upload to S3 staging
s3_key = await s3_client.upload_to_staging(
workspace=workspace,
doc_id=doc_id,
content=content,
filename=file.filename or f'{doc_id}.bin',
content_type=final_content_type,
metadata={
'original_size': str(len(content)),
'content_type': final_content_type,
},
)
s3_url = s3_client.get_s3_url(s3_key)
logger.info(f'Document uploaded to staging: {s3_key}')
return UploadResponse(
status='uploaded',
doc_id=doc_id,
s3_key=s3_key,
s3_url=s3_url,
message='Document staged for processing',
)
except HTTPException:
raise
except Exception as e:
logger.error(f'Upload failed: {e}')
raise HTTPException(status_code=500, detail=f'Upload failed: {e}') from e
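
    # A hedged, client-side sketch of calling the endpoint above; not part of the route
    # definitions. It assumes the router is mounted at its '/upload' prefix on the
    # default LightRAG API port (9621) and that no API key is configured:
    #
    #     import httpx
    #
    #     with open('report.pdf', 'rb') as f:
    #         resp = httpx.post(
    #             'http://localhost:9621/upload',
    #             files={'file': ('report.pdf', f, 'application/pdf')},
    #             data={'workspace': 'default'},
    #         )
    #     staged = resp.json()  # e.g. {'status': 'uploaded', 'doc_id': ..., 's3_key': ...}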

    @router.get(
        '/staged',
        response_model=ListStagedResponse,
        summary='List staged documents',
        description='List all documents in the staging area for a workspace.',
    )
    async def list_staged(
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ListStagedResponse:
        """List documents in staging."""
        try:
            objects = await s3_client.list_staging(workspace)
            documents = [
                StagedDocument(
                    key=obj['key'],
                    size=obj['size'],
                    last_modified=obj['last_modified'],
                )
                for obj in objects
            ]
            return ListStagedResponse(
                workspace=workspace,
                documents=documents,
                count=len(documents),
            )
        except Exception as e:
            logger.error(f'Failed to list staged documents: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to list staged documents: {e}') from e

    @router.get(
        '/presigned-url',
        response_model=PresignedUrlResponse,
        summary='Get presigned URL',
        description='Generate a presigned URL for direct access to a document in S3.',
    )
    async def get_presigned_url(
        s3_key: str,
        expiry: int = 3600,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> PresignedUrlResponse:
        """Get presigned URL for a document."""
        try:
            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(status_code=404, detail='Object not found')

            url = await s3_client.get_presigned_url(s3_key, expiry=expiry)
            return PresignedUrlResponse(
                s3_key=s3_key,
                presigned_url=url,
                expiry_seconds=expiry,
            )
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to generate presigned URL: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to generate presigned URL: {e}') from e

    @router.delete(
        '/staged/{doc_id}',
        summary='Delete staged document',
        description='Delete a document from the staging area.',
    )
    async def delete_staged(
        doc_id: str,
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> dict[str, str]:
        """Delete a staged document."""
        try:
            # List objects with this doc_id prefix
            prefix = f'staging/{workspace}/{doc_id}/'
            objects = await s3_client.list_staging(workspace)

            # Filter to this doc_id
            to_delete = [obj['key'] for obj in objects if obj['key'].startswith(prefix)]
            if not to_delete:
                raise HTTPException(status_code=404, detail='Document not found in staging')

            # Delete each object
            for key in to_delete:
                await s3_client.delete_object(key)

            return {
                'status': 'deleted',
                'doc_id': doc_id,
                'deleted_count': str(len(to_delete)),
            }
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to delete staged document: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to delete staged document: {e}') from e

    @router.post(
        '/process',
        response_model=ProcessS3Response,
        summary='Process document from S3 staging',
        description="""
        Fetch a document from S3 staging and process it through the RAG pipeline.

        This endpoint:
        1. Fetches the document content from S3 staging
        2. Processes it through the RAG pipeline (chunking, entity extraction, embedding)
        3. Stores processed data in PostgreSQL with s3_key reference
        4. Optionally moves the document from staging to archive

        The s3_key should be the full key returned from the upload endpoint,
        e.g., "staging/default/doc_abc123/report.pdf"
        """,
    )
    async def process_from_s3(
        request: ProcessS3Request,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ProcessS3Response:
        """Process a staged document through the RAG pipeline."""
        try:
            s3_key = request.s3_key

            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(
                    status_code=404,
                    detail=f'Document not found in S3: {s3_key}',
                )

            # Fetch content from S3
            content_bytes, metadata = await s3_client.get_object(s3_key)

            # Extract doc_id from s3_key if not provided
            # s3_key format: staging/{workspace}/{doc_id}/{filename}
            doc_id = request.doc_id
            if not doc_id:
                parts = s3_key.split('/')
                doc_id = parts[2] if len(parts) >= 3 else compute_mdhash_id(content_bytes, prefix='doc_')

            # Determine content type and decode appropriately
            content_type = metadata.get('content_type', 'application/octet-stream')
            s3_url = s3_client.get_s3_url(s3_key)

            # For text-based content, decode to string
            if content_type.startswith('text/') or content_type in (
                'application/json',
                'application/xml',
                'application/javascript',
            ):
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    text_content = content_bytes.decode('latin-1')
            else:
                # For binary content (PDF, Word, etc.), we need document parsing
                # For now, attempt UTF-8 decode or fail gracefully
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    raise HTTPException(
                        status_code=400,
                        detail=f'Cannot process binary content type: {content_type}. '
                        'Document parsing for PDF/Word not yet implemented.',
                    ) from None

            if not text_content.strip():
                raise HTTPException(
                    status_code=400,
                    detail='Document content is empty after decoding',
                )

            # Process through RAG pipeline
            # Use s3_url as file_path for citation reference
            logger.info(f'Processing S3 document: {s3_key} (doc_id: {doc_id})')
            track_id = await rag.ainsert(
                input=text_content,
                ids=doc_id,
                file_paths=s3_url,
            )

            # Move to archive if requested
            archive_key = None
            if request.archive_after_processing:
                try:
                    archive_key = await s3_client.move_to_archive(s3_key)
                    logger.info(f'Moved to archive: {s3_key} -> {archive_key}')

                    # Update database chunks with archive s3_key
                    archive_url = s3_client.get_s3_url(archive_key)
                    if isinstance(rag.text_chunks, PGKVStorage):
                        updated_count = await rag.text_chunks.update_s3_key_by_doc_id(
                            full_doc_id=doc_id,
                            s3_key=archive_key,
                            archive_url=archive_url,
                        )
                        logger.info(f'Updated {updated_count} chunks with archive s3_key: {archive_key}')
                    if isinstance(rag.doc_status, PGDocStatusStorage):
                        await rag.doc_status.update_s3_key(doc_id, archive_key)
                        logger.info(f'Updated doc_status with archive s3_key: {archive_key}')
                except Exception as e:
                    logger.warning(f'Failed to archive document: {e}')
                    # Don't fail the request, processing succeeded

            return ProcessS3Response(
                status='processing_complete',
                track_id=track_id,
                doc_id=doc_id,
                s3_key=s3_key,
                archive_key=archive_key,
                message='Document processed and stored in RAG pipeline',
            )
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to process S3 document: {e}')
            raise HTTPException(
                status_code=500,
                detail=f'Failed to process S3 document: {e}',
            ) from e
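
    # A hedged sketch of the second half of the staging flow: after /upload returns an
    # s3_key, a client asks this endpoint to run the RAG pipeline on the staged object.
    # Host, port, and the absence of an API key are assumptions, as above:
    #
    #     resp = httpx.post(
    #         'http://localhost:9621/upload/process',
    #         json={'s3_key': staged['s3_key'], 'archive_after_processing': True},
    #     )
    #     result = resp.json()  # includes 'track_id' for monitoring processing progress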

    return router
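

# A minimal, hedged sketch of mounting this router in a FastAPI app. The LightRAG and
# S3Client constructor arguments below are placeholders (both are configured elsewhere,
# e.g. by the API server's startup code), not the actual initialization:
#
#     from fastapi import FastAPI
#     from lightrag import LightRAG
#     from lightrag.storage.s3_client import S3Client
#
#     app = FastAPI()
#     rag = LightRAG(...)          # real construction requires LLM/embedding config
#     s3_client = S3Client(...)    # real construction requires endpoint/bucket credentials
#     app.include_router(create_upload_routes(rag, s3_client, api_key=None))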