Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs

Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes

Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples

Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios

Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md

"""
Upload routes for S3/RustFS document staging.

This module provides endpoints for:
- Uploading documents to S3 staging
- Listing staged documents
- Getting presigned URLs
"""

import mimetypes
from typing import Annotated, Any, ClassVar

from fastapi import (
    APIRouter,
    Depends,
    File,
    Form,
    HTTPException,
    UploadFile,
)
from pydantic import BaseModel, Field

from lightrag import LightRAG
from lightrag.api.utils_api import get_combined_auth_dependency
from lightrag.kg.postgres_impl import PGDocStatusStorage, PGKVStorage
from lightrag.storage.s3_client import S3Client
from lightrag.utils import compute_mdhash_id, logger
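
# How these routes are typically mounted (illustrative sketch only; the concrete wiring
# lives in the API server setup, and the `rag` / `s3_client` instances are assumed to be
# constructed there):
#
#     from fastapi import FastAPI
#
#     app = FastAPI()
#     app.include_router(create_upload_routes(rag, s3_client, api_key=None))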


class UploadResponse(BaseModel):
    """Response model for document upload."""

    status: str = Field(description='Upload status')
    doc_id: str = Field(description='Document ID')
    s3_key: str = Field(description='S3 object key')
    s3_url: str = Field(description='S3 URL (s3://bucket/key)')
    message: str | None = Field(default=None, description='Additional message')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'uploaded',
                'doc_id': 'doc_abc123',
                's3_key': 'staging/default/doc_abc123/report.pdf',
                's3_url': 's3://lightrag/staging/default/doc_abc123/report.pdf',
                'message': 'Document staged for processing',
            }
        }


class StagedDocument(BaseModel):
    """Model for a staged document."""

    key: str = Field(description='S3 object key')
    size: int = Field(description='File size in bytes')
    last_modified: str = Field(description='Last modified timestamp')


class ListStagedResponse(BaseModel):
    """Response model for listing staged documents."""

    workspace: str = Field(description='Workspace name')
    documents: list[StagedDocument] = Field(description='List of staged documents')
    count: int = Field(description='Number of documents')


class PresignedUrlResponse(BaseModel):
    """Response model for presigned URL."""

    s3_key: str = Field(description='S3 object key')
    presigned_url: str = Field(description='Presigned URL for direct access')
    expiry_seconds: int = Field(description='URL expiry time in seconds')


class ProcessS3Request(BaseModel):
    """Request model for processing a document from S3 staging."""

    s3_key: str = Field(description='S3 key of the staged document')
    doc_id: str | None = Field(
        default=None,
        description='Document ID (extracted from s3_key if not provided)',
    )
    archive_after_processing: bool = Field(
        default=True,
        description='Move document to archive after successful processing',
    )

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                's3_key': 'staging/default/doc_abc123/report.pdf',
                'doc_id': 'doc_abc123',
                'archive_after_processing': True,
            }
        }


class ProcessS3Response(BaseModel):
    """Response model for S3 document processing."""

    status: str = Field(description='Processing status')
    track_id: str = Field(description='Track ID for monitoring processing progress')
    doc_id: str = Field(description='Document ID')
    s3_key: str = Field(description='Original S3 key')
    archive_key: str | None = Field(default=None, description='Archive S3 key (if archived)')
    message: str | None = Field(default=None, description='Additional message')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'processing_started',
                'track_id': 'insert_20250101_120000_abc123',
                'doc_id': 'doc_abc123',
                's3_key': 'staging/default/doc_abc123/report.pdf',
                'archive_key': 'archive/default/doc_abc123/report.pdf',
                'message': 'Document processing started',
            }
        }


def create_upload_routes(
    rag: LightRAG,
    s3_client: S3Client,
    api_key: str | None = None,
) -> APIRouter:
    """
    Create upload routes for S3 document staging.

    Args:
        rag: LightRAG instance
        s3_client: Initialized S3Client instance
        api_key: Optional API key for authentication

    Returns:
        FastAPI router with upload endpoints
    """
    router = APIRouter(
        prefix='/upload',
        tags=['upload'],
    )

    optional_api_key = get_combined_auth_dependency(api_key)

    @router.post(
        '',
        response_model=UploadResponse,
        summary='Upload document to S3 staging',
        description="""
Upload a document to S3/RustFS staging area.

The document will be staged at: s3://bucket/staging/{workspace}/{doc_id}/{filename}

After upload, the document can be processed by calling the standard document
processing endpoints, which will:
1. Fetch the document from S3 staging
2. Process it through the RAG pipeline
3. Move it to S3 archive
4. Store processed data in PostgreSQL
        """,
    )
    async def upload_document(
        file: Annotated[UploadFile, File(description='Document file to upload')],
        workspace: Annotated[str, Form(description='Workspace name')] = 'default',
        doc_id: Annotated[str | None, Form(description='Optional document ID (auto-generated if not provided)')] = None,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> UploadResponse:
        """Upload a document to S3 staging."""
        try:
            # Read file content
            content = await file.read()

            if not content:
                raise HTTPException(status_code=400, detail='Empty file')

            # Generate doc_id if not provided
            if not doc_id:
                doc_id = compute_mdhash_id(content, prefix='doc_')

            # Determine content type
            final_content_type = file.content_type
            if not final_content_type:
                guessed_type, _encoding = mimetypes.guess_type(file.filename or '')
                final_content_type = guessed_type or 'application/octet-stream'

            # Upload to S3 staging
            s3_key = await s3_client.upload_to_staging(
                workspace=workspace,
                doc_id=doc_id,
                content=content,
                filename=file.filename or f'{doc_id}.bin',
                content_type=final_content_type,
                metadata={
                    'original_size': str(len(content)),
                    'content_type': final_content_type,
                },
            )

            s3_url = s3_client.get_s3_url(s3_key)

            logger.info(f'Document uploaded to staging: {s3_key}')

            return UploadResponse(
                status='uploaded',
                doc_id=doc_id,
                s3_key=s3_key,
                s3_url=s3_url,
                message='Document staged for processing',
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Upload failed: {e}')
            raise HTTPException(status_code=500, detail=f'Upload failed: {e}') from e
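
    # Illustrative client call for the upload route above. The multipart field names come
    # from the File/Form parameters; the host, port, and the assumption that the router is
    # mounted at the application root are not fixed by this module:
    #
    #     curl -X POST "http://localhost:9621/upload" \
    #         -F "file=@report.pdf" \
    #         -F "workspace=default"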

    @router.get(
        '/staged',
        response_model=ListStagedResponse,
        summary='List staged documents',
        description='List all documents in the staging area for a workspace.',
    )
    async def list_staged(
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ListStagedResponse:
        """List documents in staging."""
        try:
            objects = await s3_client.list_staging(workspace)

            documents = [
                StagedDocument(
                    key=obj['key'],
                    size=obj['size'],
                    last_modified=obj['last_modified'],
                )
                for obj in objects
            ]

            return ListStagedResponse(
                workspace=workspace,
                documents=documents,
                count=len(documents),
            )

        except Exception as e:
            logger.error(f'Failed to list staged documents: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to list staged documents: {e}') from e

    @router.get(
        '/presigned-url',
        response_model=PresignedUrlResponse,
        summary='Get presigned URL',
        description='Generate a presigned URL for direct access to a document in S3.',
    )
    async def get_presigned_url(
        s3_key: str,
        expiry: int = 3600,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> PresignedUrlResponse:
        """Get presigned URL for a document."""
        try:
            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(status_code=404, detail='Object not found')

            url = await s3_client.get_presigned_url(s3_key, expiry=expiry)

            return PresignedUrlResponse(
                s3_key=s3_key,
                presigned_url=url,
                expiry_seconds=expiry,
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to generate presigned URL: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to generate presigned URL: {e}') from e

    @router.delete(
        '/staged/{doc_id}',
        summary='Delete staged document',
        description='Delete a document from the staging area.',
    )
    async def delete_staged(
        doc_id: str,
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> dict[str, str]:
        """Delete a staged document."""
        try:
            # List objects with this doc_id prefix
            prefix = f'staging/{workspace}/{doc_id}/'
            objects = await s3_client.list_staging(workspace)

            # Filter to this doc_id
            to_delete = [obj['key'] for obj in objects if obj['key'].startswith(prefix)]

            if not to_delete:
                raise HTTPException(status_code=404, detail='Document not found in staging')

            # Delete each object
            for key in to_delete:
                await s3_client.delete_object(key)

            return {
                'status': 'deleted',
                'doc_id': doc_id,
                'deleted_count': str(len(to_delete)),
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to delete staged document: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to delete staged document: {e}') from e

    @router.post(
        '/process',
        response_model=ProcessS3Response,
        summary='Process document from S3 staging',
        description="""
Fetch a document from S3 staging and process it through the RAG pipeline.

This endpoint:
1. Fetches the document content from S3 staging
2. Processes it through the RAG pipeline (chunking, entity extraction, embedding)
3. Stores processed data in PostgreSQL with s3_key reference
4. Optionally moves the document from staging to archive

The s3_key should be the full key returned from the upload endpoint,
e.g., "staging/default/doc_abc123/report.pdf"
        """,
    )
    async def process_from_s3(
        request: ProcessS3Request,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ProcessS3Response:
        """Process a staged document through the RAG pipeline."""
        try:
            s3_key = request.s3_key

            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(
                    status_code=404,
                    detail=f'Document not found in S3: {s3_key}',
                )

            # Fetch content from S3
            content_bytes, metadata = await s3_client.get_object(s3_key)

            # Extract doc_id from s3_key if not provided
            # s3_key format: staging/{workspace}/{doc_id}/{filename}
            doc_id = request.doc_id
            if not doc_id:
                parts = s3_key.split('/')
                doc_id = parts[2] if len(parts) >= 3 else compute_mdhash_id(content_bytes, prefix='doc_')

            # Determine content type and decode appropriately
            content_type = metadata.get('content_type', 'application/octet-stream')
            s3_url = s3_client.get_s3_url(s3_key)

            # For text-based content, decode to string
            if content_type.startswith('text/') or content_type in (
                'application/json',
                'application/xml',
                'application/javascript',
            ):
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    text_content = content_bytes.decode('latin-1')
            else:
                # For binary content (PDF, Word, etc.), we need document parsing
                # For now, attempt UTF-8 decode or fail gracefully
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    raise HTTPException(
                        status_code=400,
                        detail=f'Cannot process binary content type: {content_type}. '
                        'Document parsing for PDF/Word not yet implemented.',
                    ) from None

            if not text_content.strip():
                raise HTTPException(
                    status_code=400,
                    detail='Document content is empty after decoding',
                )

            # Process through RAG pipeline
            # Use s3_url as file_path for citation reference
            logger.info(f'Processing S3 document: {s3_key} (doc_id: {doc_id})')

            track_id = await rag.ainsert(
                input=text_content,
                ids=doc_id,
                file_paths=s3_url,
            )

            # Move to archive if requested
            archive_key = None
            if request.archive_after_processing:
                try:
                    archive_key = await s3_client.move_to_archive(s3_key)
                    logger.info(f'Moved to archive: {s3_key} -> {archive_key}')

                    # Update database chunks with archive s3_key
                    archive_url = s3_client.get_s3_url(archive_key)
                    if isinstance(rag.text_chunks, PGKVStorage):
                        updated_count = await rag.text_chunks.update_s3_key_by_doc_id(
                            full_doc_id=doc_id,
                            s3_key=archive_key,
                            archive_url=archive_url,
                        )
                        logger.info(f'Updated {updated_count} chunks with archive s3_key: {archive_key}')
                    if isinstance(rag.doc_status, PGDocStatusStorage):
                        await rag.doc_status.update_s3_key(doc_id, archive_key)
                        logger.info(f'Updated doc_status with archive s3_key: {archive_key}')
                except Exception as e:
                    logger.warning(f'Failed to archive document: {e}')
                    # Don't fail the request, processing succeeded

            return ProcessS3Response(
                status='processing_complete',
                track_id=track_id,
                doc_id=doc_id,
                s3_key=s3_key,
                archive_key=archive_key,
                message='Document processed and stored in RAG pipeline',
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to process S3 document: {e}')
            raise HTTPException(
                status_code=500,
                detail=f'Failed to process S3 document: {e}',
            ) from e
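
    # Illustrative request body for the /process route above (keys mirror ProcessS3Request;
    # the s3_key value reuses the example key from the model's schema):
    #
    #     {
    #         "s3_key": "staging/default/doc_abc123/report.pdf",
    #         "archive_after_processing": true
    #     }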

    return router