Add S3 storage client and API routes for document management:
- Implement s3_routes.py with file upload, download, delete endpoints
- Enhance s3_client.py with improved error handling and operations
- Add S3 browser UI component with file viewing and management
- Implement FileViewer and PDFViewer components for storage preview
- Add Resizable and Sheet UI components for layout control

Update backend infrastructure:
- Add bulk operations and parameterized queries to postgres_impl.py
- Enhance document routes with improved type hints
- Update API server registration for new S3 routes
- Refine upload routes and utility functions

Modernize web UI:
- Integrate S3 browser into main application layout
- Update localization files for storage UI strings
- Add storage settings to application configuration
- Sync package dependencies and lock files

Remove obsolete reproduction script:
- Delete reproduce_citation.py (replaced by test suite)

Update configuration:
- Enhance pyrightconfig.json for stricter type checking
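The intended flow for the new routes is stage, inspect, then process. Below is a minimal client sketch of that flow; only the endpoint paths and payload fields come from upload_routes.py, while the base URL, the /upload mount point, the absence of an API key header, and the use of the requests library are assumptions about a particular deployment. Binary formats such as PDF are accepted by the upload endpoint but are currently rejected by /process until document parsing lands (see the handler below), so the sketch uses a plain-text file.

# Client-side sketch of the staging flow (endpoint paths and fields taken from
# upload_routes.py below). Assumptions: the LightRAG API server is reachable at
# http://localhost:9621, the router keeps its default /upload prefix, and no
# API key header is required.
import requests

BASE_URL = 'http://localhost:9621/upload'

# 1. Stage a text document in S3/RustFS.
with open('notes.txt', 'rb') as fh:
    resp = requests.post(
        BASE_URL,
        files={'file': ('notes.txt', fh, 'text/plain')},
        data={'workspace': 'default'},
    )
resp.raise_for_status()
upload = resp.json()
print(upload['doc_id'], upload['s3_key'])  # doc_..., staging/default/doc_.../notes.txt

# 2. Confirm it shows up in the staging listing for the workspace.
staged = requests.get(f'{BASE_URL}/staged', params={'workspace': 'default'}).json()
print(staged['count'], 'document(s) staged')

# 3. Run it through the RAG pipeline and move it to the archive prefix afterwards.
result = requests.post(
    f'{BASE_URL}/process',
    json={'s3_key': upload['s3_key'], 'archive_after_processing': True},
).json()
print(result['status'], result['track_id'], result['archive_key'])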
443 lines
16 KiB
Python
"""
|
|
Upload routes for S3/RustFS document staging.
|
|
|
|
This module provides endpoints for:
|
|
- Uploading documents to S3 staging
|
|
- Listing staged documents
|
|
- Getting presigned URLs
|
|
"""
|
|
|
|
import mimetypes
|
|
from typing import Annotated, Any, ClassVar, cast
|
|
|
|
from fastapi import (
|
|
APIRouter,
|
|
Depends,
|
|
File,
|
|
Form,
|
|
HTTPException,
|
|
UploadFile,
|
|
)
|
|
from pydantic import BaseModel, Field
|
|
|
|
from lightrag import LightRAG
|
|
from lightrag.api.utils_api import get_combined_auth_dependency
|
|
from lightrag.kg.postgres_impl import PGDocStatusStorage, PGKVStorage
|
|
from lightrag.storage.s3_client import S3Client
|
|
from lightrag.utils import compute_mdhash_id, logger
|
|
|
|
|
|
class UploadResponse(BaseModel):
    """Response model for document upload."""

    status: str = Field(description='Upload status')
    doc_id: str = Field(description='Document ID')
    s3_key: str = Field(description='S3 object key')
    s3_url: str = Field(description='S3 URL (s3://bucket/key)')
    message: str | None = Field(default=None, description='Additional message')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'uploaded',
                'doc_id': 'doc_abc123',
                's3_key': 'staging/default/doc_abc123/report.pdf',
                's3_url': 's3://lightrag/staging/default/doc_abc123/report.pdf',
                'message': 'Document staged for processing',
            }
        }


class StagedDocument(BaseModel):
    """Model for a staged document."""

    key: str = Field(description='S3 object key')
    size: int = Field(description='File size in bytes')
    last_modified: str = Field(description='Last modified timestamp')


class ListStagedResponse(BaseModel):
    """Response model for listing staged documents."""

    workspace: str = Field(description='Workspace name')
    documents: list[StagedDocument] = Field(description='List of staged documents')
    count: int = Field(description='Number of documents')


class PresignedUrlResponse(BaseModel):
    """Response model for presigned URL."""

    s3_key: str = Field(description='S3 object key')
    presigned_url: str = Field(description='Presigned URL for direct access')
    expiry_seconds: int = Field(description='URL expiry time in seconds')


class ProcessS3Request(BaseModel):
    """Request model for processing a document from S3 staging."""

    s3_key: str = Field(description='S3 key of the staged document')
    doc_id: str | None = Field(
        default=None,
        description='Document ID (extracted from s3_key if not provided)',
    )
    archive_after_processing: bool = Field(
        default=True,
        description='Move document to archive after successful processing',
    )

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                's3_key': 'staging/default/doc_abc123/report.pdf',
                'doc_id': 'doc_abc123',
                'archive_after_processing': True,
            }
        }


class ProcessS3Response(BaseModel):
    """Response model for S3 document processing."""

    status: str = Field(description='Processing status')
    track_id: str = Field(description='Track ID for monitoring processing progress')
    doc_id: str = Field(description='Document ID')
    s3_key: str = Field(description='Original S3 key')
    archive_key: str | None = Field(default=None, description='Archive S3 key (if archived)')
    message: str | None = Field(default=None, description='Additional message')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'processing_complete',
                'track_id': 'insert_20250101_120000_abc123',
                'doc_id': 'doc_abc123',
                's3_key': 'staging/default/doc_abc123/report.pdf',
                'archive_key': 'archive/default/doc_abc123/report.pdf',
                'message': 'Document processed and stored in RAG pipeline',
            }
        }


def create_upload_routes(
    rag: LightRAG,
    s3_client: S3Client,
    api_key: str | None = None,
) -> APIRouter:
    """
    Create upload routes for S3 document staging.

    Args:
        rag: LightRAG instance
        s3_client: Initialized S3Client instance
        api_key: Optional API key for authentication

    Returns:
        FastAPI router with upload endpoints
    """
    router = APIRouter(
        prefix='/upload',
        tags=['upload'],
    )

    optional_api_key = get_combined_auth_dependency(api_key)

    @router.post(
        '',
        response_model=UploadResponse,
        summary='Upload document to S3 staging',
        description="""
        Upload a document to S3/RustFS staging area.

        The document will be staged at: s3://bucket/staging/{workspace}/{doc_id}/{filename}

        After upload, the document can be processed by calling the standard document
        processing endpoints, which will:
        1. Fetch the document from S3 staging
        2. Process it through the RAG pipeline
        3. Move it to S3 archive
        4. Store processed data in PostgreSQL
        """,
    )
    async def upload_document(
        file: Annotated[UploadFile, File(description='Document file to upload')],
        workspace: Annotated[str, Form(description='Workspace name')] = 'default',
        doc_id: Annotated[str | None, Form(description='Optional document ID (auto-generated if not provided)')] = None,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> UploadResponse:
        """Upload a document to S3 staging."""
        try:
            # Read file content
            content = await file.read()

            if not content:
                raise HTTPException(status_code=400, detail='Empty file')

            # Generate doc_id if not provided
            if not doc_id:
                doc_id = compute_mdhash_id(content, prefix='doc_')

            # Determine content type
            final_content_type = file.content_type
            if not final_content_type:
                guessed_type, encoding = mimetypes.guess_type(file.filename or '')
                final_content_type = guessed_type or 'application/octet-stream'

            # Upload to S3 staging
            s3_key = await s3_client.upload_to_staging(
                workspace=workspace,
                doc_id=doc_id,
                content=content,
                filename=file.filename or f'{doc_id}.bin',
                content_type=final_content_type,
                metadata={
                    'original_size': str(len(content)),
                    'content_type': final_content_type,
                },
            )

            s3_url = s3_client.get_s3_url(s3_key)

            logger.info(f'Document uploaded to staging: {s3_key}')

            return UploadResponse(
                status='uploaded',
                doc_id=doc_id,
                s3_key=s3_key,
                s3_url=s3_url,
                message='Document staged for processing',
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Upload failed: {e}')
            raise HTTPException(status_code=500, detail=f'Upload failed: {e}') from e

    @router.get(
        '/staged',
        response_model=ListStagedResponse,
        summary='List staged documents',
        description='List all documents in the staging area for a workspace.',
    )
    async def list_staged(
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ListStagedResponse:
        """List documents in staging."""
        try:
            objects = await s3_client.list_staging(workspace)

            documents = [
                StagedDocument(
                    key=obj['key'],
                    size=obj['size'],
                    last_modified=obj['last_modified'],
                )
                for obj in objects
            ]

            return ListStagedResponse(
                workspace=workspace,
                documents=documents,
                count=len(documents),
            )

        except Exception as e:
            logger.error(f'Failed to list staged documents: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to list staged documents: {e}') from e

    @router.get(
        '/presigned-url',
        response_model=PresignedUrlResponse,
        summary='Get presigned URL',
        description='Generate a presigned URL for direct access to a document in S3.',
    )
    async def get_presigned_url(
        s3_key: str,
        expiry: int = 3600,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> PresignedUrlResponse:
        """Get presigned URL for a document."""
        try:
            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(status_code=404, detail='Object not found')

            url = await s3_client.get_presigned_url(s3_key, expiry=expiry)

            return PresignedUrlResponse(
                s3_key=s3_key,
                presigned_url=url,
                expiry_seconds=expiry,
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to generate presigned URL: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to generate presigned URL: {e}') from e

    @router.delete(
        '/staged/{doc_id}',
        summary='Delete staged document',
        description='Delete a document from the staging area.',
    )
    async def delete_staged(
        doc_id: str,
        workspace: str = 'default',
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> dict[str, str]:
        """Delete a staged document."""
        try:
            # List objects with this doc_id prefix
            prefix = f'staging/{workspace}/{doc_id}/'
            objects = await s3_client.list_staging(workspace)

            # Filter to this doc_id
            to_delete = [obj['key'] for obj in objects if obj['key'].startswith(prefix)]

            if not to_delete:
                raise HTTPException(status_code=404, detail='Document not found in staging')

            # Delete each object
            for key in to_delete:
                await s3_client.delete_object(key)

            return {
                'status': 'deleted',
                'doc_id': doc_id,
                'deleted_count': str(len(to_delete)),
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to delete staged document: {e}')
            raise HTTPException(status_code=500, detail=f'Failed to delete staged document: {e}') from e

    @router.post(
        '/process',
        response_model=ProcessS3Response,
        summary='Process document from S3 staging',
        description="""
        Fetch a document from S3 staging and process it through the RAG pipeline.

        This endpoint:
        1. Fetches the document content from S3 staging
        2. Processes it through the RAG pipeline (chunking, entity extraction, embedding)
        3. Stores processed data in PostgreSQL with s3_key reference
        4. Optionally moves the document from staging to archive

        The s3_key should be the full key returned from the upload endpoint,
        e.g., "staging/default/doc_abc123/report.pdf"
        """,
    )
    async def process_from_s3(
        request: ProcessS3Request,
        _: Annotated[bool, Depends(optional_api_key)] = True,
    ) -> ProcessS3Response:
        """Process a staged document through the RAG pipeline."""
        try:
            s3_key = request.s3_key

            # Verify object exists
            if not await s3_client.object_exists(s3_key):
                raise HTTPException(
                    status_code=404,
                    detail=f'Document not found in S3: {s3_key}',
                )

            # Fetch content from S3
            content_bytes, metadata = await s3_client.get_object(s3_key)

            # Extract doc_id from s3_key if not provided
            # s3_key format: staging/{workspace}/{doc_id}/{filename}
            doc_id = request.doc_id
            if not doc_id:
                parts = s3_key.split('/')
                doc_id = parts[2] if len(parts) >= 3 else compute_mdhash_id(content_bytes, prefix='doc_')

            # Determine content type and decode appropriately
            content_type = metadata.get('content_type', 'application/octet-stream')
            s3_url = s3_client.get_s3_url(s3_key)

            # For text-based content, decode to string
            if content_type.startswith('text/') or content_type in (
                'application/json',
                'application/xml',
                'application/javascript',
            ):
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    text_content = content_bytes.decode('latin-1')
            else:
                # For binary content (PDF, Word, etc.), we need document parsing
                # For now, attempt UTF-8 decode or fail gracefully
                try:
                    text_content = content_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    raise HTTPException(
                        status_code=400,
                        detail=f'Cannot process binary content type: {content_type}. '
                        'Document parsing for PDF/Word not yet implemented.',
                    ) from None

            if not text_content.strip():
                raise HTTPException(
                    status_code=400,
                    detail='Document content is empty after decoding',
                )

            # Process through RAG pipeline
            # Use s3_url as file_path for citation reference
            logger.info(f'Processing S3 document: {s3_key} (doc_id: {doc_id})')

            track_id = await rag.ainsert(
                input=text_content,
                ids=doc_id,
                file_paths=s3_url,
            )

            # Move to archive if requested
            archive_key = None
            if request.archive_after_processing:
                try:
                    archive_key = await s3_client.move_to_archive(s3_key)
                    logger.info(f'Moved to archive: {s3_key} -> {archive_key}')

                    # Update database chunks with archive s3_key
                    archive_url = s3_client.get_s3_url(archive_key)
                    updated_count = await cast(PGKVStorage, rag.text_chunks).update_s3_key_by_doc_id(
                        full_doc_id=doc_id,
                        s3_key=archive_key,
                        archive_url=archive_url,
                    )
                    logger.info(f'Updated {updated_count} chunks with archive s3_key: {archive_key}')

                    # Update doc_status with archive s3_key
                    await cast(PGDocStatusStorage, rag.doc_status).update_s3_key(doc_id, archive_key)
                    logger.info(f'Updated doc_status with archive s3_key: {archive_key}')
                except Exception as e:
                    logger.warning(f'Failed to archive document: {e}')
                    # Don't fail the request, processing succeeded

            return ProcessS3Response(
                status='processing_complete',
                track_id=track_id,
                doc_id=doc_id,
                s3_key=s3_key,
                archive_key=archive_key,
                message='Document processed and stored in RAG pipeline',
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Failed to process S3 document: {e}')
            raise HTTPException(
                status_code=500,
                detail=f'Failed to process S3 document: {e}',
            ) from e

    return router
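The factory above only builds the router; registration happens in the API server, per the commit message ("Update API server registration for new S3 routes"). A rough wiring sketch follows. The create_upload_routes(...) call matches the signature defined in this file, but the import path, the FastAPI app setup, and the pre-built rag and s3_client instances are placeholders for illustration, not the actual server code.

# Hypothetical registration sketch -- not the real LightRAG server startup code.
from fastapi import FastAPI

from lightrag import LightRAG
from lightrag.storage.s3_client import S3Client
from lightrag.api.routers.upload_routes import create_upload_routes  # assumed module path

app = FastAPI(title='LightRAG API')

rag: LightRAG = ...          # constructed during server startup (omitted here)
s3_client: S3Client = ...    # initialized S3/RustFS client (constructor args omitted)

# Mount the staging endpoints; passing api_key=None leaves authentication to the
# combined auth dependency's default behaviour.
app.include_router(create_upload_routes(rag=rag, s3_client=s3_client, api_key=None))

# Resulting endpoints under the router's /upload prefix:
#   POST   /upload                  - stage a document
#   GET    /upload/staged           - list staged documents
#   GET    /upload/presigned-url    - presigned URL for direct access
#   DELETE /upload/staged/{doc_id}  - delete a staged document
#   POST   /upload/process          - fetch from staging and run the RAG pipeline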