Add S3 storage client and API routes for document management:
- Implement s3_routes.py with file upload, download, delete endpoints
- Enhance s3_client.py with improved error handling and operations
- Add S3 browser UI component with file viewing and management
- Implement FileViewer and PDFViewer components for storage preview
- Add Resizable and Sheet UI components for layout control

Update backend infrastructure:
- Add bulk operations and parameterized queries to postgres_impl.py
- Enhance document routes with improved type hints
- Update API server registration for new S3 routes
- Refine upload routes and utility functions

Modernize web UI:
- Integrate S3 browser into main application layout
- Update localization files for storage UI strings
- Add storage settings to application configuration
- Sync package dependencies and lock files

Remove obsolete reproduction script:
- Delete reproduce_citation.py (replaced by test suite)

Update configuration:
- Enhance pyrightconfig.json for stricter type checking
2968 lines
123 KiB
Python
"""
|
|
This module contains all document-related routes for the LightRAG API.
|
|
"""
|
|
|
|
import asyncio
|
|
import shutil
|
|
import traceback
|
|
from datetime import datetime, timezone
|
|
from functools import lru_cache
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
from typing import Annotated, Any, ClassVar, Literal
|
|
|
|
import aiofiles
|
|
from fastapi import (
|
|
APIRouter,
|
|
BackgroundTasks,
|
|
Depends,
|
|
File,
|
|
HTTPException,
|
|
UploadFile,
|
|
)
|
|
from pydantic import BaseModel, Field, field_validator
|
|
|
|
from lightrag import LightRAG
|
|
from lightrag.api.config import global_args
|
|
from lightrag.api.utils_api import get_combined_auth_dependency
|
|
from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus
|
|
from lightrag.utils import (
|
|
compute_mdhash_id,
|
|
generate_track_id,
|
|
get_pinyin_sort_key,
|
|
logger,
|
|
sanitize_text_for_encoding,
|
|
)


@lru_cache(maxsize=1)
def _is_docling_available() -> bool:
    """Check if docling is available (cached check).

    This function uses lru_cache to avoid repeated import attempts.
    The result is cached after the first call.

    Returns:
        bool: True if docling is available, False otherwise
    """
    try:
        import docling  # noqa: F401  # type: ignore[import-not-found]

        return True
    except ImportError:
        return False
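
# Note (illustrative, relying on standard functools behavior): lru_cache means
# the import probe runs at most once per process. If docling is installed
# while the server is running, the cached result can be discarded manually:
#
#     _is_docling_available.cache_clear()   # drop the cached probe result
#     _is_docling_available()               # re-checks availability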


# Function to format datetime to ISO format string with timezone information
def format_datetime(dt: Any) -> str | None:
    """Format datetime to ISO format string with timezone information

    Args:
        dt: Datetime object, string, or None

    Returns:
        ISO format string with timezone information, or None if input is None
    """
    if dt is None:
        return None
    if isinstance(dt, str):
        return dt

    # Check if datetime object has timezone information
    if isinstance(dt, datetime) and dt.tzinfo is None:
        # If datetime object has no timezone info (naive datetime), add UTC timezone
        dt = dt.replace(tzinfo=timezone.utc)

    # Return ISO format string with timezone information
    return dt.isoformat()
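
# Example (illustrative sketch of the contract above):
#
#     >>> from datetime import datetime
#     >>> format_datetime(None) is None
#     True
#     >>> format_datetime('2025-03-31T12:34:56+00:00')  # strings pass through unchanged
#     '2025-03-31T12:34:56+00:00'
#     >>> format_datetime(datetime(2025, 3, 31, 12, 34, 56))  # naive datetimes assume UTC
#     '2025-03-31T12:34:56+00:00'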


router = APIRouter(
    prefix='/documents',
    tags=['documents'],
)

# Temporary file prefix
temp_prefix = '__tmp__'


def sanitize_filename(filename: str, input_dir: Path) -> str:
    """
    Sanitize uploaded filename to prevent Path Traversal attacks.

    Args:
        filename: The original filename from the upload
        input_dir: The target input directory

    Returns:
        str: Sanitized filename that is safe to use

    Raises:
        HTTPException: If the filename is unsafe or invalid
    """
    # Basic validation
    if not filename or not filename.strip():
        raise HTTPException(status_code=400, detail='Filename cannot be empty')

    # Remove path separators and traversal sequences
    clean_name = filename.replace('/', '').replace('\\', '')
    clean_name = clean_name.replace('..', '')

    # Remove control characters and null bytes
    clean_name = ''.join(c for c in clean_name if ord(c) >= 32 and c != '\x7f')

    # Remove leading/trailing whitespace and dots
    clean_name = clean_name.strip().strip('.')

    # Check if anything is left after sanitization
    if not clean_name:
        raise HTTPException(status_code=400, detail='Invalid filename after sanitization')

    # Verify the final path stays within the input directory
    try:
        final_path = (input_dir / clean_name).resolve()
        if not final_path.is_relative_to(input_dir.resolve()):
            raise HTTPException(status_code=400, detail='Unsafe filename detected')
    except (OSError, ValueError):
        raise HTTPException(status_code=400, detail='Invalid filename') from None

    return clean_name
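
# Example (illustrative sketch): a traversal attempt is flattened to a bare
# filename before the containment check runs, so it cannot escape input_dir.
#
#     >>> from pathlib import Path
#     >>> sanitize_filename('../../etc/passwd', Path('./inputs'))
#     'etcpasswd'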


class ScanResponse(BaseModel):
    """Response model for document scanning operation

    Attributes:
        status: Status of the scanning operation
        message: Optional message with additional details
        track_id: Tracking ID for monitoring scanning progress
    """

    status: Literal['scanning_started'] = Field(description='Status of the scanning operation')
    message: str | None = Field(default=None, description='Additional details about the scanning operation')
    track_id: str = Field(description='Tracking ID for monitoring scanning progress')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'scanning_started',
                'message': 'Scanning process has been initiated in the background',
                'track_id': 'scan_20250729_170612_abc123',
            }
        }


class ReprocessResponse(BaseModel):
    """Response model for reprocessing failed documents operation

    Attributes:
        status: Status of the reprocessing operation
        message: Message describing the operation result
        track_id: Always empty string. Reprocessed documents retain their original track_id.
    """

    status: Literal['reprocessing_started'] = Field(description='Status of the reprocessing operation')
    message: str = Field(description='Human-readable message describing the operation')
    track_id: str = Field(
        default='',
        description='Always empty string. Reprocessed documents retain their original track_id from initial upload.',
    )

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'reprocessing_started',
                'message': 'Reprocessing of failed documents has been initiated in background',
                'track_id': '',
            }
        }


class CancelPipelineResponse(BaseModel):
    """Response model for pipeline cancellation operation

    Attributes:
        status: Status of the cancellation request
        message: Message describing the operation result
    """

    status: Literal['cancellation_requested', 'not_busy'] = Field(description='Status of the cancellation request')
    message: str = Field(description='Human-readable message describing the operation')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'cancellation_requested',
                'message': 'Pipeline cancellation has been requested. Documents will be marked as FAILED.',
            }
        }


class InsertTextRequest(BaseModel):
    """Request model for inserting a single text document

    Attributes:
        text: The text content to be inserted into the RAG system
        file_source: Source of the text (optional)
    """

    text: str = Field(
        min_length=1,
        description='The text to insert',
    )
    file_source: str | None = Field(default=None, min_length=0, description='File Source')

    @field_validator('text', mode='after')
    @classmethod
    def strip_text_after(cls, text: str) -> str:
        return text.strip()

    @field_validator('file_source', mode='after')
    @classmethod
    def strip_source_after(cls, file_source: str | None) -> str | None:
        if file_source is None:
            return None
        return file_source.strip()

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'text': 'This is a sample text to be inserted into the RAG system.',
                'file_source': 'Source of the text (optional)',
            }
        }


class InsertTextsRequest(BaseModel):
    """Request model for inserting multiple text documents

    Attributes:
        texts: List of text contents to be inserted into the RAG system
        file_sources: Sources of the texts (optional)
    """

    texts: list[str] = Field(
        min_length=1,
        description='The texts to insert',
    )
    file_sources: list[str] | None = Field(default=None, min_length=0, description='Sources of the texts')

    @field_validator('texts', mode='after')
    @classmethod
    def strip_texts_after(cls, texts: list[str]) -> list[str]:
        return [text.strip() for text in texts]

    @field_validator('file_sources', mode='after')
    @classmethod
    def strip_sources_after(cls, file_sources: list[str] | None) -> list[str] | None:
        if file_sources is None:
            return None
        return [file_source.strip() for file_source in file_sources]

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'texts': [
                    'This is the first text to be inserted.',
                    'This is the second text to be inserted.',
                ],
                'file_sources': [
                    'First file source (optional)',
                ],
            }
        }


class InsertResponse(BaseModel):
    """Response model for document insertion operations

    Attributes:
        status: Status of the operation (success, duplicated, partial_success, failure)
        message: Detailed message describing the operation result
        track_id: Tracking ID for monitoring processing status
    """

    status: Literal['success', 'duplicated', 'partial_success', 'failure'] = Field(
        description='Status of the operation'
    )
    message: str = Field(description='Message describing the operation result')
    track_id: str = Field(description='Tracking ID for monitoring processing status')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'success',
                'message': "File 'document.pdf' uploaded successfully. Processing will continue in background.",
                'track_id': 'upload_20250729_170612_abc123',
            }
        }


class ClearDocumentsResponse(BaseModel):
    """Response model for document clearing operation

    Attributes:
        status: Status of the clear operation
        message: Detailed message describing the operation result
    """

    status: Literal['success', 'partial_success', 'busy', 'fail'] = Field(description='Status of the clear operation')
    message: str = Field(description='Message describing the operation result')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'success',
                'message': 'All documents cleared successfully. Deleted 15 files.',
            }
        }


class ClearCacheRequest(BaseModel):
    """Request model for clearing cache

    This model is kept for API compatibility but no longer accepts any parameters.
    All cache will be cleared regardless of the request content.
    """

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {'example': {}}


class ClearCacheResponse(BaseModel):
    """Response model for cache clearing operation

    Attributes:
        status: Status of the clear operation
        message: Detailed message describing the operation result
    """

    status: Literal['success', 'fail'] = Field(description='Status of the clear operation')
    message: str = Field(description='Message describing the operation result')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status': 'success',
                'message': "Successfully cleared cache for modes: ['default', 'naive']",
            }
        }


class DeleteDocRequest(BaseModel):
    doc_ids: list[str] = Field(..., description='The IDs of the documents to delete.')
    delete_file: bool = Field(
        default=False,
        description='Whether to delete the corresponding file in the upload directory.',
    )
    delete_llm_cache: bool = Field(
        default=False,
        description='Whether to delete cached LLM extraction results for the documents.',
    )

    @field_validator('doc_ids', mode='after')
    @classmethod
    def validate_doc_ids(cls, doc_ids: list[str]) -> list[str]:
        if not doc_ids:
            raise ValueError('Document IDs list cannot be empty')

        validated_ids = []
        for doc_id in doc_ids:
            if not doc_id or not doc_id.strip():
                raise ValueError('Document ID cannot be empty')
            validated_ids.append(doc_id.strip())

        # Check for duplicates
        if len(validated_ids) != len(set(validated_ids)):
            raise ValueError('Document IDs must be unique')

        return validated_ids
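
# Example (illustrative sketch of the validator's behavior): IDs are stripped,
# and empty or duplicate IDs raise a pydantic ValidationError.
#
#     >>> DeleteDocRequest(doc_ids=[' doc-1 ', 'doc-2']).doc_ids
#     ['doc-1', 'doc-2']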


class DeleteEntityRequest(BaseModel):
    entity_name: str = Field(..., description='The name of the entity to delete.')

    @field_validator('entity_name', mode='after')
    @classmethod
    def validate_entity_name(cls, entity_name: str) -> str:
        if not entity_name or not entity_name.strip():
            raise ValueError('Entity name cannot be empty')
        return entity_name.strip()


class DeleteRelationRequest(BaseModel):
    source_entity: str = Field(..., description='The name of the source entity.')
    target_entity: str = Field(..., description='The name of the target entity.')

    @field_validator('source_entity', 'target_entity', mode='after')
    @classmethod
    def validate_entity_names(cls, entity_name: str) -> str:
        if not entity_name or not entity_name.strip():
            raise ValueError('Entity name cannot be empty')
        return entity_name.strip()


class DocStatusResponse(BaseModel):
    """Response model for document status

    Attributes:
        id: Document identifier
        content_summary: Summary of document content
        content_length: Length of document content
        status: Current processing status
        created_at: Creation timestamp (ISO format string)
        updated_at: Last update timestamp (ISO format string)
        track_id: Tracking ID for monitoring progress (optional)
        chunks_count: Number of chunks (optional)
        error_msg: Error message if processing failed (optional)
        metadata: Additional metadata (optional)
        file_path: Path to the document file
        s3_key: S3 storage key for archived documents (optional)
    """

    id: str = Field(description='Document identifier')
    content_summary: str = Field(description='Summary of document content')
    content_length: int = Field(description='Length of document content in characters')
    status: DocStatus = Field(description='Current processing status')
    created_at: str | None = Field(default=None, description='Creation timestamp (ISO format string)')
    updated_at: str | None = Field(default=None, description='Last update timestamp (ISO format string)')
    track_id: str | None = Field(default=None, description='Tracking ID for monitoring progress')
    chunks_count: int | None = Field(default=None, description='Number of chunks the document was split into')
    error_msg: str | None = Field(default=None, description='Error message if processing failed')
    metadata: dict[str, Any] | None = Field(default=None, description='Additional metadata about the document')
    file_path: str | None = Field(default=None, description='Path to the document file')
    s3_key: str | None = Field(default=None, description='S3 storage key for archived documents')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'id': 'doc_123456',
                'content_summary': 'Research paper on machine learning',
                'content_length': 15240,
                'status': 'processed',
                'created_at': '2025-03-31T12:34:56',
                'updated_at': '2025-03-31T12:35:30',
                'track_id': 'upload_20250729_170612_abc123',
                'chunks_count': 12,
                'error_msg': None,
                'metadata': {'author': 'John Doe', 'year': 2025},
                'file_path': 's3://lightrag/archive/default/doc_123456/research_paper.pdf',
                's3_key': 'archive/default/doc_123456/research_paper.pdf',
            }
        }


class DocsStatusesResponse(BaseModel):
    """Response model for document statuses

    Attributes:
        statuses: Dictionary mapping document status to lists of document status responses
    """

    statuses: dict[DocStatus, list[DocStatusResponse]] = Field(
        default_factory=dict,
        description='Dictionary mapping document status to lists of document status responses',
    )

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'statuses': {
                    'PENDING': [
                        {
                            'id': 'doc_123',
                            'content_summary': 'Pending document',
                            'content_length': 5000,
                            'status': 'pending',
                            'created_at': '2025-03-31T10:00:00',
                            'updated_at': '2025-03-31T10:00:00',
                            'track_id': 'upload_20250331_100000_abc123',
                            'chunks_count': None,
                            'error_msg': None,
                            'metadata': None,
                            'file_path': 'pending_doc.pdf',
                        }
                    ],
                    'PREPROCESSED': [
                        {
                            'id': 'doc_789',
                            'content_summary': 'Document pending final indexing',
                            'content_length': 7200,
                            'status': 'preprocessed',
                            'created_at': '2025-03-31T09:30:00',
                            'updated_at': '2025-03-31T09:35:00',
                            'track_id': 'upload_20250331_093000_xyz789',
                            'chunks_count': 10,
                            'error_msg': None,
                            'metadata': None,
                            'file_path': 'preprocessed_doc.pdf',
                        }
                    ],
                    'PROCESSED': [
                        {
                            'id': 'doc_456',
                            'content_summary': 'Processed document',
                            'content_length': 8000,
                            'status': 'processed',
                            'created_at': '2025-03-31T09:00:00',
                            'updated_at': '2025-03-31T09:05:00',
                            'track_id': 'insert_20250331_090000_def456',
                            'chunks_count': 8,
                            'error_msg': None,
                            'metadata': {'author': 'John Doe'},
                            'file_path': 'processed_doc.pdf',
                        }
                    ],
                }
            }
        }


class TrackStatusResponse(BaseModel):
    """Response model for tracking document processing status by track_id

    Attributes:
        track_id: The tracking ID
        documents: List of documents associated with this track_id
        total_count: Total number of documents for this track_id
        status_summary: Count of documents by status
    """

    track_id: str = Field(description='The tracking ID')
    documents: list[DocStatusResponse] = Field(description='List of documents associated with this track_id')
    total_count: int = Field(description='Total number of documents for this track_id')
    status_summary: dict[str, int] = Field(description='Count of documents by status')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'track_id': 'upload_20250729_170612_abc123',
                'documents': [
                    {
                        'id': 'doc_123456',
                        'content_summary': 'Research paper on machine learning',
                        'content_length': 15240,
                        'status': 'PROCESSED',
                        'created_at': '2025-03-31T12:34:56',
                        'updated_at': '2025-03-31T12:35:30',
                        'track_id': 'upload_20250729_170612_abc123',
                        'chunks_count': 12,
                        'error_msg': None,
                        'metadata': {'author': 'John Doe', 'year': 2025},
                        'file_path': 'research_paper.pdf',
                    }
                ],
                'total_count': 1,
                'status_summary': {'PROCESSED': 1},
            }
        }


class DocumentsRequest(BaseModel):
    """Request model for paginated document queries

    Attributes:
        status_filter: Filter by document status, None for all statuses
        page: Page number (1-based)
        page_size: Number of documents per page (10-200)
        sort_field: Field to sort by ('created_at', 'updated_at', 'id', 'file_path')
        sort_direction: Sort direction ('asc' or 'desc')
    """

    status_filter: DocStatus | None = Field(
        default=None, description='Filter by document status, None for all statuses'
    )
    page: int = Field(default=1, ge=1, description='Page number (1-based)')
    page_size: int = Field(default=50, ge=10, le=200, description='Number of documents per page (10-200)')
    sort_field: Literal['created_at', 'updated_at', 'id', 'file_path'] = Field(
        default='updated_at', description='Field to sort by'
    )
    sort_direction: Literal['asc', 'desc'] = Field(default='desc', description='Sort direction')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status_filter': 'PROCESSED',
                'page': 1,
                'page_size': 50,
                'sort_field': 'updated_at',
                'sort_direction': 'desc',
            }
        }


class PaginationInfo(BaseModel):
    """Pagination information

    Attributes:
        page: Current page number
        page_size: Number of items per page
        total_count: Total number of items
        total_pages: Total number of pages
        has_next: Whether there is a next page
        has_prev: Whether there is a previous page
    """

    page: int = Field(description='Current page number')
    page_size: int = Field(description='Number of items per page')
    total_count: int = Field(description='Total number of items')
    total_pages: int = Field(description='Total number of pages')
    has_next: bool = Field(description='Whether there is a next page')
    has_prev: bool = Field(description='Whether there is a previous page')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'page': 1,
                'page_size': 50,
                'total_count': 150,
                'total_pages': 3,
                'has_next': True,
                'has_prev': False,
            }
        }


class PaginatedDocsResponse(BaseModel):
    """Response model for paginated document queries

    Attributes:
        documents: List of documents for the current page
        pagination: Pagination information
        status_counts: Count of documents by status for all documents
    """

    documents: list[DocStatusResponse] = Field(description='List of documents for the current page')
    pagination: PaginationInfo = Field(description='Pagination information')
    status_counts: dict[str, int] = Field(description='Count of documents by status for all documents')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'documents': [
                    {
                        'id': 'doc_123456',
                        'content_summary': 'Research paper on machine learning',
                        'content_length': 15240,
                        'status': 'PROCESSED',
                        'created_at': '2025-03-31T12:34:56',
                        'updated_at': '2025-03-31T12:35:30',
                        'track_id': 'upload_20250729_170612_abc123',
                        'chunks_count': 12,
                        'error_msg': None,
                        'metadata': {'author': 'John Doe', 'year': 2025},
                        'file_path': 'research_paper.pdf',
                    }
                ],
                'pagination': {
                    'page': 1,
                    'page_size': 50,
                    'total_count': 150,
                    'total_pages': 3,
                    'has_next': True,
                    'has_prev': False,
                },
                'status_counts': {
                    'PENDING': 10,
                    'PROCESSING': 5,
                    'PREPROCESSED': 5,
                    'PROCESSED': 130,
                    'FAILED': 5,
                },
            }
        }


class StatusCountsResponse(BaseModel):
    """Response model for document status counts

    Attributes:
        status_counts: Count of documents by status
    """

    status_counts: dict[str, int] = Field(description='Count of documents by status')

    class Config:
        json_schema_extra: ClassVar[dict[str, Any]] = {
            'example': {
                'status_counts': {
                    'PENDING': 10,
                    'PROCESSING': 5,
                    'PREPROCESSED': 5,
                    'PROCESSED': 130,
                    'FAILED': 5,
                }
            }
        }


class PipelineStatusResponse(BaseModel):
    """Response model for pipeline status

    Attributes:
        autoscanned: Whether auto-scan has started
        busy: Whether the pipeline is currently busy
        job_name: Current job name (e.g., indexing files/indexing texts)
        job_start: Job start time as ISO format string with timezone (optional)
        docs: Total number of documents to be indexed
        batchs: Number of batches for processing documents
        cur_batch: Current processing batch
        request_pending: Flag for pending request for processing
        latest_message: Latest message from pipeline processing
        history_messages: List of history messages
        update_status: Status of update flags for all namespaces
    """

    autoscanned: bool = False
    busy: bool = False
    job_name: str = 'Default Job'
    job_start: str | None = None
    docs: int = 0
    batchs: int = 0
    cur_batch: int = 0
    request_pending: bool = False
    latest_message: str = ''
    history_messages: list[str] | None = None
    update_status: dict | None = None

    @field_validator('job_start', mode='before')
    @classmethod
    def parse_job_start(cls, value):
        """Process datetime and return as ISO format string with timezone"""
        return format_datetime(value)

    class Config:
        extra = 'allow'  # Allow additional fields from the pipeline status


class DocumentManager:
    def __init__(
        self,
        input_dir: str,
        workspace: str = '',  # New parameter for workspace isolation
        supported_extensions: tuple = (
            '.txt',
            '.md',
            '.pdf',
            '.docx',
            '.pptx',
            '.xlsx',
            '.rtf',  # Rich Text Format
            '.odt',  # OpenDocument Text
            '.tex',  # LaTeX
            '.epub',  # Electronic Publication
            '.html',  # HyperText Markup Language
            '.htm',  # HyperText Markup Language
            '.csv',  # Comma-Separated Values
            '.json',  # JavaScript Object Notation
            '.xml',  # eXtensible Markup Language
            '.yaml',  # YAML Ain't Markup Language
            '.yml',  # YAML
            '.log',  # Log files
            '.conf',  # Configuration files
            '.ini',  # Initialization files
            '.properties',  # Java properties files
            '.sql',  # SQL scripts
            '.bat',  # Batch files
            '.sh',  # Shell scripts
            '.c',  # C source code
            '.cpp',  # C++ source code
            '.py',  # Python source code
            '.java',  # Java source code
            '.js',  # JavaScript source code
            '.ts',  # TypeScript source code
            '.swift',  # Swift source code
            '.go',  # Go source code
            '.rb',  # Ruby source code
            '.php',  # PHP source code
            '.css',  # Cascading Style Sheets
            '.scss',  # Sassy CSS
            '.less',  # LESS CSS
        ),
    ):
        # Store the base input directory and workspace
        self.base_input_dir = Path(input_dir)
        self.workspace = workspace
        self.supported_extensions = supported_extensions
        self.indexed_files = set()

        # Create workspace-specific input directory
        # If workspace is provided, create a subdirectory for data isolation
        if workspace:
            self.input_dir = self.base_input_dir / workspace
        else:
            self.input_dir = self.base_input_dir

        # Create input directory if it doesn't exist
        self.input_dir.mkdir(parents=True, exist_ok=True)

    def scan_directory_for_new_files(self) -> list[Path]:
        """Scan input directory for new files"""
        new_files = []
        for ext in self.supported_extensions:
            logger.debug(f'Scanning for {ext} files in {self.input_dir}')
            for file_path in self.input_dir.glob(f'*{ext}'):
                if file_path not in self.indexed_files:
                    new_files.append(file_path)
        return new_files

    def mark_as_indexed(self, file_path: Path):
        self.indexed_files.add(file_path)

    def is_supported_file(self, filename: str) -> bool:
        return any(filename.lower().endswith(ext) for ext in self.supported_extensions)
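
# Example (illustrative sketch; the workspace argument isolates uploads under
# the base input directory, e.g. inputs/team_a):
#
#     >>> dm = DocumentManager('./inputs', workspace='team_a')
#     >>> dm.is_supported_file('report.PDF')  # extension check is case-insensitive
#     True
#     >>> dm.is_supported_file('archive.zip')
#     False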


def validate_file_path_security(file_path_str: str, base_dir: Path) -> Path | None:
    """
    Validate file path security to prevent Path Traversal attacks.

    Args:
        file_path_str: The file path string to validate
        base_dir: The base directory that the file must be within

    Returns:
        Path: Safe file path if valid, None if unsafe or invalid
    """
    if not file_path_str or not file_path_str.strip():
        return None

    try:
        # Clean the file path string
        clean_path_str = file_path_str.strip()

        # Check for obvious path traversal patterns before processing
        # This catches both Unix (..) and Windows (..\) style traversals
        if '..' in clean_path_str and (
            '\\..\\' in clean_path_str or clean_path_str.startswith('..\\') or clean_path_str.endswith('\\..')
        ):
            # logger.warning(
            #     f"Security violation: Windows path traversal attempt detected - {file_path_str}"
            # )
            return None

        # Normalize path separators (convert backslashes to forward slashes)
        # This helps handle Windows-style paths on Unix systems
        normalized_path = clean_path_str.replace('\\', '/')

        # Create path object and resolve it (handles symlinks and relative paths)
        candidate_path = (base_dir / normalized_path).resolve()
        base_dir_resolved = base_dir.resolve()

        # Check if the resolved path is within the base directory
        if not candidate_path.is_relative_to(base_dir_resolved):
            # logger.warning(
            #     f"Security violation: Path traversal attempt detected - {file_path_str}"
            # )
            return None

        return candidate_path

    except Exception as e:
        # Catch-all: OSError, ValueError, or any other resolution failure is treated as unsafe
        logger.warning(f'Invalid file path detected: {file_path_str} - {e!s}')
        return None
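
# Example (illustrative sketch of the containment check; the happy path is
# skipped because the resolved path depends on the local filesystem):
#
#     >>> from pathlib import Path
#     >>> validate_file_path_security('docs/report.pdf', Path('./inputs'))  # doctest: +SKIP
#     PosixPath('.../inputs/docs/report.pdf')
#     >>> validate_file_path_security('..\\..\\etc\\passwd', Path('./inputs')) is None
#     True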


def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str:
    """Generate a unique filename in the target directory by adding numeric suffixes if needed

    Args:
        target_dir: Target directory path
        original_name: Original filename

    Returns:
        str: Unique filename (may have numeric suffix added)
    """
    import time

    original_path = Path(original_name)
    base_name = original_path.stem
    extension = original_path.suffix

    # Try original name first
    if not (target_dir / original_name).exists():
        return original_name

    # Try with numeric suffixes 001-999
    for i in range(1, 1000):
        suffix = f'{i:03d}'
        new_name = f'{base_name}_{suffix}{extension}'
        if not (target_dir / new_name).exists():
            return new_name

    # Fallback with timestamp if all 999 slots are taken
    timestamp = int(time.time())
    return f'{base_name}_{timestamp}{extension}'
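
# Example (illustrative): if 'report.pdf' and 'report_001.pdf' already exist in
# the target directory, the next call yields 'report_002.pdf'; once all 999
# numeric slots are taken it falls back to 'report_<unix-timestamp>.pdf'.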


# Document processing helper functions (synchronous)
# These functions run in a thread pool via asyncio.to_thread() to avoid blocking the event loop


def _convert_with_docling(file_path: Path) -> str:
    """Convert document using docling (synchronous).

    Args:
        file_path: Path to the document file

    Returns:
        str: Extracted markdown content
    """
    from docling.document_converter import DocumentConverter  # type: ignore

    converter = DocumentConverter()
    result = converter.convert(file_path)
    return result.document.export_to_markdown()


def _extract_pdf_pypdf(file_bytes: bytes, password: str | None = None) -> str:
    """Extract PDF content using pypdf (synchronous).

    Args:
        file_bytes: PDF file content as bytes
        password: Optional password for encrypted PDFs

    Returns:
        str: Extracted text content

    Raises:
        Exception: If PDF is encrypted and password is incorrect or missing
    """
    from pypdf import PdfReader  # type: ignore

    pdf_file = BytesIO(file_bytes)
    reader = PdfReader(pdf_file)

    # Check if PDF is encrypted
    if reader.is_encrypted:
        if not password:
            raise Exception('PDF is encrypted but no password provided')

        decrypt_result = reader.decrypt(password)
        if decrypt_result == 0:
            raise Exception('Incorrect PDF password')

    # Extract text from all pages
    content = ''
    for page in reader.pages:
        content += page.extract_text() + '\n'

    return content
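
# Note (assumption based on pypdf's documented behavior): PdfReader.decrypt()
# returns a PasswordType IntEnum whose NOT_DECRYPTED member equals 0, so the
# `== 0` comparison above treats any failed decryption as a wrong password.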


def _extract_docx(file_bytes: bytes) -> str:
    """Extract DOCX content including tables in document order (synchronous).

    Args:
        file_bytes: DOCX file content as bytes

    Returns:
        str: Extracted text content with tables in their original positions.
            Tables are separated from paragraphs with blank lines for clarity.
    """
    from docx import Document  # type: ignore
    from docx.table import Table  # type: ignore
    from docx.text.paragraph import Paragraph  # type: ignore

    docx_file = BytesIO(file_bytes)
    doc = Document(docx_file)

    def escape_cell(cell_value: object | None) -> str:
        """Escape characters that would break tab-delimited layout.

        Escape order is critical: backslashes first, then tabs/newlines.
        This prevents double-escaping issues.

        Args:
            cell_value: The cell value to escape (can be None or str)

        Returns:
            str: Escaped cell value safe for tab-delimited format
        """
        if cell_value is None:
            return ''
        text = str(cell_value)
        # CRITICAL: Escape backslash first to avoid double-escaping
        return (
            text.replace('\\', '\\\\')  # Must be first: \ -> \\
            .replace('\t', '\\t')  # Tab -> \t (visible)
            .replace('\r\n', '\\n')  # Windows newline -> \n
            .replace('\r', '\\n')  # Mac newline -> \n
            .replace('\n', '\\n')  # Unix newline -> \n
        )

    content_parts = []
    in_table = False  # Track if we're currently processing a table

    # Iterate through all body elements in document order
    for element in doc.element.body:
        # Check if element is a paragraph
        if element.tag.endswith('p'):
            # If coming out of a table, add blank line after table
            if in_table:
                content_parts.append('')  # Blank line after table
                in_table = False

            paragraph = Paragraph(element, doc)
            text = paragraph.text
            # Always append to preserve document spacing (including blank paragraphs)
            content_parts.append(text)

        # Check if element is a table
        elif element.tag.endswith('tbl'):
            # Add blank line before table (if content exists)
            if content_parts and not in_table:
                content_parts.append('')  # Blank line before table

            in_table = True
            table = Table(element, doc)
            for row in table.rows:
                row_text = []
                for cell in row.cells:
                    cell_text = cell.text
                    # Escape special characters to preserve tab-delimited structure
                    row_text.append(escape_cell(cell_text))
                # Only add row if at least one cell has content
                if any(cell for cell in row_text):
                    content_parts.append('\t'.join(row_text))

    return '\n'.join(content_parts)
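
# Example (illustrative sketch of the cell escaping above): a cell containing a
# tab and a newline stays on one tab-delimited line.
#
#     raw cell:      'line1' + '\n' + 'line2' + '\t' + 'end'
#     escaped cell:  'line1\\nline2\\tend'  (literal backslash-n / backslash-t)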


def _extract_pptx(file_bytes: bytes) -> str:
    """Extract PPTX content (synchronous).

    Args:
        file_bytes: PPTX file content as bytes

    Returns:
        str: Extracted text content
    """
    from pptx import Presentation  # type: ignore

    pptx_file = BytesIO(file_bytes)
    prs = Presentation(pptx_file)
    content = ''
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, 'text'):
                content += shape.text + '\n'  # type: ignore
    return content


def _extract_xlsx(file_bytes: bytes) -> str:
    """Extract XLSX content in tab-delimited format with clear sheet separation.

    This function processes Excel workbooks and converts them to a structured text format
    suitable for LLM prompts and RAG systems. Each sheet is clearly delimited with
    separator lines, and special characters are escaped to preserve the tab-delimited structure.

    Features:
        - Each sheet is wrapped with '====================' separators for visual distinction
        - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
        - Column alignment is preserved across all rows to maintain tabular structure
        - Empty rows are preserved as blank lines to maintain row structure
        - Uses sheet.max_column to determine column width efficiently

    Args:
        file_bytes: XLSX file content as bytes

    Returns:
        str: Extracted text content with all sheets in tab-delimited format.
            Format: Sheet separators, sheet name, then tab-delimited rows.

    Example output:
        ==================== Sheet: Data ====================
        Name\tAge\tCity
        Alice\t30\tNew York
        Bob\t25\tLondon

        ==================== Sheet: Summary ====================
        Total\t2
        ====================
    """
    from openpyxl import load_workbook  # type: ignore

    xlsx_file = BytesIO(file_bytes)
    wb = load_workbook(xlsx_file)

    def escape_cell(cell_value: object | None) -> str:
        """Escape characters that would break tab-delimited layout.

        Escape order is critical: backslashes first, then tabs/newlines.
        This prevents double-escaping issues.

        Args:
            cell_value: The cell value to escape (can be None, str, int, or float)

        Returns:
            str: Escaped cell value safe for tab-delimited format
        """
        if cell_value is None:
            return ''
        text = str(cell_value)
        # CRITICAL: Escape backslash first to avoid double-escaping
        return (
            text.replace('\\', '\\\\')  # Must be first: \ -> \\
            .replace('\t', '\\t')  # Tab -> \t (visible)
            .replace('\r\n', '\\n')  # Windows newline -> \n
            .replace('\r', '\\n')  # Mac newline -> \n
            .replace('\n', '\\n')  # Unix newline -> \n
        )

    def escape_sheet_title(title: str) -> str:
        """Escape sheet title to prevent formatting issues in separators.

        Args:
            title: Original sheet title

        Returns:
            str: Sanitized sheet title with tabs/newlines replaced
        """
        return str(title).replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')

    content_parts: list[str] = []
    sheet_separator = '=' * 20

    for sheet_idx, sheet in enumerate(wb):
        if sheet_idx > 0:
            content_parts.append('')  # Blank line between sheets for readability

        # Escape sheet title to handle edge cases with special characters
        safe_title = escape_sheet_title(sheet.title)
        content_parts.append(f'{sheet_separator} Sheet: {safe_title} {sheet_separator}')

        # Use sheet.max_column to get the maximum column width directly
        max_columns = sheet.max_column if sheet.max_column else 0

        # Extract rows with consistent width to preserve column alignment
        for row in sheet.iter_rows(values_only=True):
            row_parts = []

            # Build row up to max_columns width (col_idx named distinctly to
            # avoid shadowing the sheet index above)
            for col_idx in range(max_columns):
                if col_idx < len(row):
                    row_parts.append(escape_cell(row[col_idx]))
                else:
                    row_parts.append('')  # Pad short rows

            # Check if row is completely empty
            if all(part == '' for part in row_parts):
                # Preserve empty rows as blank lines (maintains row structure)
                content_parts.append('')
            else:
                # Join all columns to maintain consistent column count
                content_parts.append('\t'.join(row_parts))

        # Final separator for symmetry (makes parsing easier)
        content_parts.append(sheet_separator)
    return '\n'.join(content_parts)


async def pipeline_enqueue_file(rag: LightRAG, file_path: Path, track_id: str | None = None) -> tuple[bool, str]:
    """Add a file to the queue for processing

    Args:
        rag: LightRAG instance
        file_path: Path to the saved file
        track_id: Optional tracking ID, if not provided will be generated

    Returns:
        tuple: (success: bool, track_id: str)
    """

    # Generate track_id if not provided
    if track_id is None:
        track_id = generate_track_id('unknown')

    try:
        content = ''
        ext = file_path.suffix.lower()
        file_size = 0

        # Get file size for error reporting
        try:
            file_size = file_path.stat().st_size
        except Exception:
            file_size = 0

        file = None
        try:
            async with aiofiles.open(file_path, 'rb') as f:
                file = await f.read()
        except PermissionError as e:
            error_files = [
                {
                    'file_path': str(file_path.name),
                    'error_description': '[File Extraction]Permission denied - cannot read file',
                    'original_error': str(e),
                    'file_size': file_size,
                }
            ]
            await rag.apipeline_enqueue_error_documents(error_files, track_id)
            logger.error(f'[File Extraction]Permission denied reading file: {file_path.name}')
            return False, track_id
        except FileNotFoundError as e:
            error_files = [
                {
                    'file_path': str(file_path.name),
                    'error_description': '[File Extraction]File not found',
                    'original_error': str(e),
                    'file_size': file_size,
                }
            ]
            await rag.apipeline_enqueue_error_documents(error_files, track_id)
            logger.error(f'[File Extraction]File not found: {file_path.name}')
            return False, track_id
        except Exception as e:
            error_files = [
                {
                    'file_path': str(file_path.name),
                    'error_description': '[File Extraction]File reading error',
                    'original_error': str(e),
                    'file_size': file_size,
                }
            ]
            await rag.apipeline_enqueue_error_documents(error_files, track_id)
            logger.error(f'[File Extraction]Error reading file {file_path.name}: {e!s}')
            return False, track_id

        # Process based on file type
        try:
            match ext:
                case (
                    '.txt'
                    | '.md'
                    | '.html'
                    | '.htm'
                    | '.tex'
                    | '.json'
                    | '.xml'
                    | '.yaml'
                    | '.yml'
                    | '.rtf'
                    | '.odt'
                    | '.epub'
                    | '.csv'
                    | '.log'
                    | '.conf'
                    | '.ini'
                    | '.properties'
                    | '.sql'
                    | '.bat'
                    | '.sh'
                    | '.c'
                    | '.cpp'
                    | '.py'
                    | '.java'
                    | '.js'
                    | '.ts'
                    | '.swift'
                    | '.go'
                    | '.rb'
                    | '.php'
                    | '.css'
                    | '.scss'
                    | '.less'
                ):
                    try:
                        # Try to decode as UTF-8
                        content = file.decode('utf-8')

                        # Validate content
                        if not content or len(content.strip()) == 0:
                            error_files = [
                                {
                                    'file_path': str(file_path.name),
                                    'error_description': '[File Extraction]Empty file content',
                                    'original_error': 'File contains no content or only whitespace',
                                    'file_size': file_size,
                                }
                            ]
                            await rag.apipeline_enqueue_error_documents(error_files, track_id)
                            logger.error(f'[File Extraction]Empty content in file: {file_path.name}')
                            return False, track_id

                        # Check if content looks like a binary data string representation
                        if content.startswith("b'") or content.startswith('b"'):
                            error_files = [
                                {
                                    'file_path': str(file_path.name),
                                    'error_description': '[File Extraction]Binary data in text file',
                                    'original_error': 'File appears to contain binary data representation instead of text',
                                    'file_size': file_size,
                                }
                            ]
                            await rag.apipeline_enqueue_error_documents(error_files, track_id)
                            logger.error(
                                f'[File Extraction]File {file_path.name} appears to contain binary data representation instead of text'
                            )
                            return False, track_id

                    except UnicodeDecodeError as e:
                        error_files = [
                            {
                                'file_path': str(file_path.name),
                                'error_description': '[File Extraction]UTF-8 encoding error, please convert it to UTF-8 before processing',
                                'original_error': f'File is not valid UTF-8 encoded text: {e!s}',
                                'file_size': file_size,
                            }
                        ]
                        await rag.apipeline_enqueue_error_documents(error_files, track_id)
                        logger.error(
                            f'[File Extraction]File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing.'
                        )
                        return False, track_id

                case '.pdf':
                    try:
                        # Try DOCLING first if configured and available
                        if global_args.document_loading_engine == 'DOCLING' and _is_docling_available():
                            content = await asyncio.to_thread(_convert_with_docling, file_path)
                        else:
                            if global_args.document_loading_engine == 'DOCLING' and not _is_docling_available():
                                logger.warning(
                                    f'DOCLING engine configured but not available for {file_path.name}. Falling back to pypdf.'
                                )
                            # Use pypdf (non-blocking via to_thread)
                            content = await asyncio.to_thread(
                                _extract_pdf_pypdf,
                                file,
                                global_args.pdf_decrypt_password,
                            )
                    except Exception as e:
                        error_files = [
                            {
                                'file_path': str(file_path.name),
                                'error_description': '[File Extraction]PDF processing error',
                                'original_error': f'Failed to extract text from PDF: {e!s}',
                                'file_size': file_size,
                            }
                        ]
                        await rag.apipeline_enqueue_error_documents(error_files, track_id)
                        logger.error(f'[File Extraction]Error processing PDF {file_path.name}: {e!s}')
                        return False, track_id

                case '.docx':
                    try:
                        # Try DOCLING first if configured and available
                        if global_args.document_loading_engine == 'DOCLING' and _is_docling_available():
                            content = await asyncio.to_thread(_convert_with_docling, file_path)
                        else:
                            if global_args.document_loading_engine == 'DOCLING' and not _is_docling_available():
                                logger.warning(
                                    f'DOCLING engine configured but not available for {file_path.name}. Falling back to python-docx.'
                                )
                            # Use python-docx (non-blocking via to_thread)
                            content = await asyncio.to_thread(_extract_docx, file)
                    except Exception as e:
                        error_files = [
                            {
                                'file_path': str(file_path.name),
                                'error_description': '[File Extraction]DOCX processing error',
                                'original_error': f'Failed to extract text from DOCX: {e!s}',
                                'file_size': file_size,
                            }
                        ]
                        await rag.apipeline_enqueue_error_documents(error_files, track_id)
                        logger.error(f'[File Extraction]Error processing DOCX {file_path.name}: {e!s}')
                        return False, track_id

                case '.pptx':
                    try:
                        # Try DOCLING first if configured and available
                        if global_args.document_loading_engine == 'DOCLING' and _is_docling_available():
                            content = await asyncio.to_thread(_convert_with_docling, file_path)
                        else:
                            if global_args.document_loading_engine == 'DOCLING' and not _is_docling_available():
                                logger.warning(
                                    f'DOCLING engine configured but not available for {file_path.name}. Falling back to python-pptx.'
                                )
                            # Use python-pptx (non-blocking via to_thread)
                            content = await asyncio.to_thread(_extract_pptx, file)
                    except Exception as e:
                        error_files = [
                            {
                                'file_path': str(file_path.name),
                                'error_description': '[File Extraction]PPTX processing error',
                                'original_error': f'Failed to extract text from PPTX: {e!s}',
                                'file_size': file_size,
                            }
                        ]
                        await rag.apipeline_enqueue_error_documents(error_files, track_id)
                        logger.error(f'[File Extraction]Error processing PPTX {file_path.name}: {e!s}')
                        return False, track_id

                case '.xlsx':
                    try:
                        # Try DOCLING first if configured and available
                        if global_args.document_loading_engine == 'DOCLING' and _is_docling_available():
                            content = await asyncio.to_thread(_convert_with_docling, file_path)
                        else:
                            if global_args.document_loading_engine == 'DOCLING' and not _is_docling_available():
                                logger.warning(
                                    f'DOCLING engine configured but not available for {file_path.name}. Falling back to openpyxl.'
                                )
                            # Use openpyxl (non-blocking via to_thread)
                            content = await asyncio.to_thread(_extract_xlsx, file)
                    except Exception as e:
                        error_files = [
                            {
                                'file_path': str(file_path.name),
                                'error_description': '[File Extraction]XLSX processing error',
                                'original_error': f'Failed to extract text from XLSX: {e!s}',
                                'file_size': file_size,
                            }
                        ]
                        await rag.apipeline_enqueue_error_documents(error_files, track_id)
                        logger.error(f'[File Extraction]Error processing XLSX {file_path.name}: {e!s}')
                        return False, track_id

                case _:
                    error_files = [
                        {
                            'file_path': str(file_path.name),
                            'error_description': f'[File Extraction]Unsupported file type: {ext}',
                            'original_error': f'File extension {ext} is not supported',
                            'file_size': file_size,
                        }
                    ]
                    await rag.apipeline_enqueue_error_documents(error_files, track_id)
                    logger.error(f'[File Extraction]Unsupported file type: {file_path.name} (extension {ext})')
                    return False, track_id

        except Exception as e:
            error_files = [
                {
                    'file_path': str(file_path.name),
                    'error_description': '[File Extraction]File format processing error',
                    'original_error': f'Unexpected error during file extraction: {e!s}',
                    'file_size': file_size,
                }
            ]
            await rag.apipeline_enqueue_error_documents(error_files, track_id)
            logger.error(f'[File Extraction]Unexpected error during extraction of {file_path.name}: {e!s}')
            return False, track_id

        # Insert into the RAG queue
        if content:
            # Check if content contains only whitespace characters
            if not content.strip():
                error_files = [
                    {
                        'file_path': str(file_path.name),
                        'error_description': '[File Extraction]File contains only whitespace',
                        'original_error': 'File content contains only whitespace characters',
                        'file_size': file_size,
                    }
                ]
                await rag.apipeline_enqueue_error_documents(error_files, track_id)
                logger.warning(f'[File Extraction]File contains only whitespace characters: {file_path.name}')
                return False, track_id

            try:
                await rag.apipeline_enqueue_documents(content, file_paths=file_path.name, track_id=track_id)

                logger.info(f'Successfully extracted and enqueued file: {file_path.name}')

                # Move file to the __enqueued__ directory after enqueuing
                try:
                    enqueued_dir = file_path.parent / '__enqueued__'
                    enqueued_dir.mkdir(exist_ok=True)

                    # Generate a unique filename to avoid conflicts
                    unique_filename = get_unique_filename_in_enqueued(enqueued_dir, file_path.name)
                    target_path = enqueued_dir / unique_filename

                    # Move the file
                    file_path.rename(target_path)
                    logger.debug(f'Moved file to enqueued directory: {file_path.name} -> {unique_filename}')

                except Exception as move_error:
                    logger.error(f'Failed to move file {file_path.name} to __enqueued__ directory: {move_error}')
                    # Don't affect the main function's success status

                return True, track_id

            except Exception as e:
                error_files = [
                    {
                        'file_path': str(file_path.name),
                        'error_description': 'Document enqueue error',
                        'original_error': f'Failed to enqueue document: {e!s}',
                        'file_size': file_size,
                    }
                ]
                await rag.apipeline_enqueue_error_documents(error_files, track_id)
                logger.error(f'Error enqueueing document {file_path.name}: {e!s}')
                return False, track_id
        else:
            error_files = [
                {
                    'file_path': str(file_path.name),
                    'error_description': 'No content extracted',
                    'original_error': 'No content could be extracted from file',
                    'file_size': file_size,
                }
            ]
            await rag.apipeline_enqueue_error_documents(error_files, track_id)
            logger.error(f'No content extracted from file: {file_path.name}')
            return False, track_id

    except Exception as e:
        # Catch-all for any unexpected errors
        try:
            file_size = file_path.stat().st_size if file_path.exists() else 0
        except Exception:
            file_size = 0

        error_files = [
            {
                'file_path': str(file_path.name),
                'error_description': 'Unexpected processing error',
                'original_error': f'Unexpected error: {e!s}',
                'file_size': file_size,
            }
        ]
        await rag.apipeline_enqueue_error_documents(error_files, track_id)
        logger.error(f'Error enqueuing file {file_path.name}: {e!s}')
        logger.error(traceback.format_exc())
        return False, track_id
    finally:
        # Clean up temporary files (e.g. text inserts saved with the __tmp__ prefix)
        if file_path.name.startswith(temp_prefix):
            try:
                file_path.unlink()
            except Exception as e:
                logger.error(f'Error deleting file {file_path}: {e!s}')


async def pipeline_index_file(rag: LightRAG, file_path: Path, track_id: str | None = None):
    """Index a file with track_id

    Args:
        rag: LightRAG instance
        file_path: Path to the saved file
        track_id: Optional tracking ID
    """
    try:
        success, _returned_track_id = await pipeline_enqueue_file(rag, file_path, track_id)
        if success:
            await rag.apipeline_process_enqueue_documents()

    except Exception as e:
        logger.error(f'Error indexing file {file_path.name}: {e!s}')
        logger.error(traceback.format_exc())


async def pipeline_index_files(rag: LightRAG, file_paths: list[Path], track_id: str | None = None):
    """Index multiple files sequentially to avoid high CPU load

    Args:
        rag: LightRAG instance
        file_paths: Paths to the files to index
        track_id: Optional tracking ID to pass to all files
    """
    if not file_paths:
        return
    try:
        enqueued = False

        # Use get_pinyin_sort_key for Chinese pinyin sorting
        sorted_file_paths = sorted(file_paths, key=lambda p: get_pinyin_sort_key(str(p)))

        # Process files sequentially with track_id
        for file_path in sorted_file_paths:
            success, _ = await pipeline_enqueue_file(rag, file_path, track_id)
            if success:
                enqueued = True

        # Process the queue only if at least one file was successfully enqueued
        if enqueued:
            await rag.apipeline_process_enqueue_documents()
    except Exception as e:
        logger.error(f'Error indexing files: {e!s}')
        logger.error(traceback.format_exc())


async def pipeline_index_texts(
    rag: LightRAG,
    texts: list[str],
    file_sources: list[str] | None = None,
    track_id: str | None = None,
):
    """Index a list of texts with track_id

    Args:
        rag: LightRAG instance
        texts: The texts to index
        file_sources: Sources of the texts
        track_id: Optional tracking ID
    """
    if not texts:
        return
    # If fewer sources than texts were provided, pad the list so every text has a source entry
    if file_sources is not None and len(file_sources) != 0 and len(file_sources) != len(texts):
        for _ in range(len(file_sources), len(texts)):
            file_sources.append('unknown_source')
    await rag.apipeline_enqueue_documents(input=texts, file_paths=file_sources, track_id=track_id)
    await rag.apipeline_process_enqueue_documents()
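
# Example (illustrative sketch; `rag` is an initialized LightRAG instance):
#
#     await pipeline_index_texts(
#         rag,
#         texts=['First document body', 'Second document body'],
#         file_sources=['notes.txt'],  # padded to ['notes.txt', 'unknown_source']
#     )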


async def run_scanning_process(rag: LightRAG, doc_manager: DocumentManager, track_id: str | None = None):
    """Background task to scan and index documents

    Args:
        rag: LightRAG instance
        doc_manager: DocumentManager instance
        track_id: Optional tracking ID to pass to all scanned files
    """
    try:
        new_files = doc_manager.scan_directory_for_new_files()
        total_files = len(new_files)
        logger.info(f'Found {total_files} files to index.')

        if new_files:
            # Check for files with PROCESSED status and filter them out
            valid_files = []
            processed_files = []

            for file_path in new_files:
                filename = file_path.name
                existing_doc_data = await rag.doc_status.get_doc_by_file_path(filename)

                if existing_doc_data and existing_doc_data.get('status') == 'processed':
                    # File is already PROCESSED, skip it with a warning
                    processed_files.append(filename)
                    logger.warning(f'Skipping already processed file: {filename}')
                else:
                    # File is new or in non-PROCESSED status, add to processing list
                    valid_files.append(file_path)

            # Process valid files (new files + non-PROCESSED status files)
            if valid_files:
                await pipeline_index_files(rag, valid_files, track_id)
                if processed_files:
                    logger.info(
                        f'Scanning process completed: {len(valid_files)} files processed, {len(processed_files)} skipped.'
                    )
                else:
                    logger.info(f'Scanning process completed: {len(valid_files)} files processed.')
            else:
                logger.info('No files to process after filtering already processed files.')
        else:
            # No new files to index, check if there are any documents in the queue
            logger.info('No upload file found, checking if there are any documents in the queue...')
            await rag.apipeline_process_enqueue_documents()

    except Exception as e:
        logger.error(f'Error during scanning process: {e!s}')
        logger.error(traceback.format_exc())
|
|
|
|
|
|
async def background_delete_documents(
    rag: LightRAG,
    doc_manager: DocumentManager,
    doc_ids: list[str],
    delete_file: bool = False,
    delete_llm_cache: bool = False,
):
    """Background task to delete multiple documents"""
    from lightrag.kg.shared_storage import (
        get_namespace_data,
        get_namespace_lock,
    )

    pipeline_status = await get_namespace_data('pipeline_status', workspace=rag.workspace)
    pipeline_status_lock = get_namespace_lock('pipeline_status', workspace=rag.workspace)

    total_docs = len(doc_ids)
    successful_deletions = []
    failed_deletions = []

    # Double-check pipeline status before proceeding
    async with pipeline_status_lock:
        if pipeline_status.get('busy', False):
            logger.warning('Error: Unexpected pipeline busy state, aborting deletion.')
            return  # Abort deletion operation

        # Set pipeline status to busy for deletion
        pipeline_status.update(
            {
                'busy': True,
                # Job name cannot be changed; it is verified in adelete_by_doc_id()
                'job_name': f'Deleting {total_docs} Documents',
                'job_start': datetime.now().isoformat(),
                'docs': total_docs,
                'batchs': total_docs,
                'cur_batch': 0,
                'latest_message': 'Starting document deletion process',
            }
        )
        # Use slice assignment to clear the list in place
        pipeline_status['history_messages'][:] = ['Starting document deletion process']
        if delete_llm_cache:
            pipeline_status['history_messages'].append('LLM cache cleanup requested for this deletion job')

    try:
        # Loop through each document ID and delete them one by one
        for i, doc_id in enumerate(doc_ids, 1):
            # Check for cancellation at the start of each document deletion
            async with pipeline_status_lock:
                if pipeline_status.get('cancellation_requested', False):
                    cancel_msg = f'Deletion cancelled by user at document {i}/{total_docs}. {len(successful_deletions)} deleted, {total_docs - i + 1} remaining.'
                    logger.info(cancel_msg)
                    pipeline_status['latest_message'] = cancel_msg
                    pipeline_status['history_messages'].append(cancel_msg)
                    # Add remaining documents to the failed list with a cancellation reason
                    failed_deletions.extend(doc_ids[i - 1 :])  # i-1 because enumerate starts at 1
                    break  # Exit the loop, remaining documents unchanged

                start_msg = f'Deleting document {i}/{total_docs}: {doc_id}'
                logger.info(start_msg)
                pipeline_status['cur_batch'] = i
                pipeline_status['latest_message'] = start_msg
                pipeline_status['history_messages'].append(start_msg)

            file_path = '#'
            try:
                result = await rag.adelete_by_doc_id(doc_id, delete_llm_cache=delete_llm_cache)
                file_path = getattr(result, 'file_path', '-')
                if result.status == 'success':
                    successful_deletions.append(doc_id)
                    success_msg = f'Document deleted {i}/{total_docs}: {doc_id}[{file_path}]'
                    logger.info(success_msg)
                    async with pipeline_status_lock:
                        pipeline_status['history_messages'].append(success_msg)

                    # Handle file deletion if requested and file_path is available
                    if delete_file and result.file_path and result.file_path != 'unknown_source':
                        try:
                            deleted_files = []
                            # SECURITY FIX: Use secure path validation to prevent arbitrary file deletion
                            safe_file_path = validate_file_path_security(result.file_path, doc_manager.input_dir)

                            if safe_file_path is None:
                                # Security violation detected - log and skip file deletion
                                security_msg = (
                                    f'Security violation: Unsafe file path detected for deletion - {result.file_path}'
                                )
                                logger.warning(security_msg)
                                async with pipeline_status_lock:
                                    pipeline_status['latest_message'] = security_msg
                                    pipeline_status['history_messages'].append(security_msg)
                            else:
                                # Check for and delete the file from the input_dir directory
                                if safe_file_path.exists():
                                    try:
                                        safe_file_path.unlink()
                                        deleted_files.append(safe_file_path.name)
                                        file_delete_msg = f'Successfully deleted input_dir file: {result.file_path}'
                                        logger.info(file_delete_msg)
                                        async with pipeline_status_lock:
                                            pipeline_status['latest_message'] = file_delete_msg
                                            pipeline_status['history_messages'].append(file_delete_msg)
                                    except Exception as file_error:
                                        file_error_msg = (
                                            f'Failed to delete input_dir file {result.file_path}: {file_error!s}'
                                        )
                                        logger.debug(file_error_msg)
                                        async with pipeline_status_lock:
                                            pipeline_status['latest_message'] = file_error_msg
                                            pipeline_status['history_messages'].append(file_error_msg)

                                # Also check for and delete files in the __enqueued__ directory
                                enqueued_dir = doc_manager.input_dir / '__enqueued__'
                                if enqueued_dir.exists():
                                    # SECURITY FIX: Validate that the file path is safe before processing
                                    # Only proceed if the original path validation passed
                                    base_name = Path(result.file_path).stem
                                    extension = Path(result.file_path).suffix

                                    # Search for exact matches and files with numeric suffixes
                                    for enqueued_file in enqueued_dir.glob(f'{base_name}*{extension}'):
                                        # Additional security check: ensure the enqueued file is within the enqueued directory
                                        safe_enqueued_path = validate_file_path_security(
                                            enqueued_file.name, enqueued_dir
                                        )
                                        if safe_enqueued_path is not None:
                                            try:
                                                enqueued_file.unlink()
                                                deleted_files.append(enqueued_file.name)
                                                logger.info(f'Successfully deleted enqueued file: {enqueued_file.name}')
                                            except Exception as enqueued_error:
                                                file_error_msg = f'Failed to delete enqueued file {enqueued_file.name}: {enqueued_error!s}'
                                                logger.debug(file_error_msg)
                                                async with pipeline_status_lock:
                                                    pipeline_status['latest_message'] = file_error_msg
                                                    pipeline_status['history_messages'].append(file_error_msg)
                                        else:
                                            security_msg = f'Security violation: Unsafe enqueued file path detected - {enqueued_file.name}'
                                            logger.warning(security_msg)

                            if deleted_files == []:
                                file_error_msg = f'File deletion skipped, missing or unsafe file: {result.file_path}'
                                logger.warning(file_error_msg)
                                async with pipeline_status_lock:
                                    pipeline_status['latest_message'] = file_error_msg
                                    pipeline_status['history_messages'].append(file_error_msg)

                        except Exception as file_error:
                            file_error_msg = f'Failed to delete file {result.file_path}: {file_error!s}'
                            logger.error(file_error_msg)
                            async with pipeline_status_lock:
                                pipeline_status['latest_message'] = file_error_msg
                                pipeline_status['history_messages'].append(file_error_msg)
                    elif delete_file:
                        no_file_msg = f'File deletion skipped, missing file path: {doc_id}'
                        logger.warning(no_file_msg)
                        async with pipeline_status_lock:
                            pipeline_status['latest_message'] = no_file_msg
                            pipeline_status['history_messages'].append(no_file_msg)
                else:
                    failed_deletions.append(doc_id)
                    error_msg = f'Failed to delete {i}/{total_docs}: {doc_id}[{file_path}] - {result.message}'
                    logger.error(error_msg)
                    async with pipeline_status_lock:
                        pipeline_status['latest_message'] = error_msg
                        pipeline_status['history_messages'].append(error_msg)

            except Exception as e:
                failed_deletions.append(doc_id)
                error_msg = f'Error deleting document {i}/{total_docs}: {doc_id}[{file_path}] - {e!s}'
                logger.error(error_msg)
                logger.error(traceback.format_exc())
                async with pipeline_status_lock:
                    pipeline_status['latest_message'] = error_msg
                    pipeline_status['history_messages'].append(error_msg)

    except Exception as e:
        error_msg = f'Critical error during batch deletion: {e!s}'
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        async with pipeline_status_lock:
            pipeline_status['history_messages'].append(error_msg)
    finally:
        # Final summary and check for pending requests
        async with pipeline_status_lock:
            pipeline_status['busy'] = False
            pipeline_status['pending_requests'] = False  # Reset pending requests flag
            pipeline_status['cancellation_requested'] = False  # Always reset cancellation flag
            completion_msg = (
                f'Deletion completed: {len(successful_deletions)} successful, {len(failed_deletions)} failed'
            )
            pipeline_status['latest_message'] = completion_msg
            pipeline_status['history_messages'].append(completion_msg)

            # Check if there are pending document indexing requests
            has_pending_request = pipeline_status.get('request_pending', False)

        # If there are pending requests, start the document processing pipeline
        if has_pending_request:
            try:
                logger.info('Processing pending document indexing requests after deletion')
                await rag.apipeline_process_enqueue_documents()
            except Exception as e:
                logger.error(f'Error processing pending documents after deletion: {e}')


def create_document_routes(rag: LightRAG, doc_manager: DocumentManager, api_key: str | None = None):
    # Create combined auth dependency for document routes
    combined_auth = get_combined_auth_dependency(api_key)

    @router.post('/scan', response_model=ScanResponse, dependencies=[Depends(combined_auth)])
    async def scan_for_new_documents(background_tasks: BackgroundTasks):
        """
        Trigger the scanning process for new documents.

        This endpoint initiates a background task that scans the input directory for new documents
        and processes them. If a scanning process is already running, it returns a status indicating
        that fact.

        Returns:
            ScanResponse: A response object containing the scanning status and track_id
        """
        # Generate a track_id with the "scan" prefix for the scanning operation
        track_id = generate_track_id('scan')

        # Start the scanning process in the background with the track_id
        background_tasks.add_task(run_scanning_process, rag, doc_manager, track_id)
        return ScanResponse(
            status='scanning_started',
            message='Scanning process has been initiated in the background',
            track_id=track_id,
        )

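    # Illustrative client call for the route above (a sketch, not part of the route
    # logic). Host and port are assumptions (the server's defaults may differ), and
    # the X-API-Key header is only needed when an API key is configured:
    #
    #   curl -X POST http://localhost:9621/documents/scan -H 'X-API-Key: <your-key>'
    #
    # The returned track_id (prefixed with "scan") can then be polled via
    # GET /documents/track_status/{track_id}.
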
    @router.post('/upload', response_model=InsertResponse, dependencies=[Depends(combined_auth)])
    async def upload_to_input_dir(
        background_tasks: BackgroundTasks,
        file: Annotated[UploadFile, File(...)],
    ):
        """
        Upload a file to the input directory and index it.

        This API endpoint accepts a file through an HTTP POST request, checks if the
        uploaded file is of a supported type, saves it in the specified input directory,
        indexes it for retrieval, and returns a success status with relevant details.

        Args:
            background_tasks: FastAPI BackgroundTasks for async processing
            file (UploadFile): The file to be uploaded. It must have an allowed extension.

        Returns:
            InsertResponse: A response object containing the upload status and a message.
                status is "success" or "duplicated"; otherwise an HTTPException is raised.

        Raises:
            HTTPException: If the file type is not supported (400) or other errors occur (500).
        """
        try:
            # Sanitize filename to prevent Path Traversal attacks
            safe_filename = sanitize_filename(file.filename or '', doc_manager.input_dir)

            if not doc_manager.is_supported_file(safe_filename):
                raise HTTPException(
                    status_code=400,
                    detail=f'Unsupported file type. Supported types: {doc_manager.supported_extensions}',
                )

            # Check if filename already exists in doc_status storage
            existing_doc_data = await rag.doc_status.get_doc_by_file_path(safe_filename)
            if existing_doc_data:
                # Get document status and track_id from the existing document
                status = existing_doc_data.get('status', 'unknown')
                # Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
                existing_track_id = existing_doc_data.get('track_id') or ''
                return InsertResponse(
                    status='duplicated',
                    message=f"File '{safe_filename}' already exists in document storage (Status: {status}).",
                    track_id=existing_track_id,
                )

            file_path = doc_manager.input_dir / safe_filename
            # Check if the file already exists in the file system
            if file_path.exists():
                return InsertResponse(
                    status='duplicated',
                    message=f"File '{safe_filename}' already exists in the input directory.",
                    track_id='',
                )

            with open(file_path, 'wb') as buffer:
                shutil.copyfileobj(file.file, buffer)

            track_id = generate_track_id('upload')

            # Add indexing to background tasks with the generated track_id
            background_tasks.add_task(pipeline_index_file, rag, file_path, track_id)

            return InsertResponse(
                status='success',
                message=f"File '{safe_filename}' uploaded successfully. Processing will continue in background.",
                track_id=track_id,
            )

        except HTTPException:
            # Re-raise HTTP errors (e.g., 400 for unsupported types) unchanged instead of converting them to 500
            raise
        except Exception as e:
            logger.error(f'Error /documents/upload: {file.filename}: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

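    # Illustrative multipart upload for the route above (a sketch; host/port are
    # assumptions and ./report.pdf is a hypothetical file):
    #
    #   curl -X POST http://localhost:9621/documents/upload \
    #        -F 'file=@./report.pdf'
    #
    # A "duplicated" status means the filename already exists in document storage or
    # in the input directory; no new processing is started in that case.
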
    @router.post('/text', response_model=InsertResponse, dependencies=[Depends(combined_auth)])
    async def insert_text(request: InsertTextRequest, background_tasks: BackgroundTasks):
        """
        Insert text into the RAG system.

        This endpoint allows you to insert text data into the RAG system for later retrieval
        and use in generating responses.

        Args:
            request (InsertTextRequest): The request body containing the text to be inserted.
            background_tasks: FastAPI BackgroundTasks for async processing

        Returns:
            InsertResponse: A response object containing the status of the operation.

        Raises:
            HTTPException: If an error occurs during text processing (500).
        """
        try:
            # Check if file_source already exists in doc_status storage
            if request.file_source and request.file_source.strip() and request.file_source != 'unknown_source':
                existing_doc_data = await rag.doc_status.get_doc_by_file_path(request.file_source)
                if existing_doc_data:
                    # Get document status and track_id from the existing document
                    status = existing_doc_data.get('status', 'unknown')
                    # Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
                    existing_track_id = existing_doc_data.get('track_id') or ''
                    return InsertResponse(
                        status='duplicated',
                        message=f"File source '{request.file_source}' already exists in document storage (Status: {status}).",
                        track_id=existing_track_id,
                    )

            # Check if the content already exists by computing its content hash (doc_id)
            sanitized_text = sanitize_text_for_encoding(request.text)
            content_doc_id = compute_mdhash_id(sanitized_text, prefix='doc-')
            existing_doc = await rag.doc_status.get_by_id(content_doc_id)
            if existing_doc:
                # Content already exists, return duplicated with the existing track_id
                status = existing_doc.get('status', 'unknown')
                existing_track_id = existing_doc.get('track_id') or ''
                return InsertResponse(
                    status='duplicated',
                    message=f'Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).',
                    track_id=existing_track_id,
                )

            # Generate a track_id for the text insertion
            track_id = generate_track_id('insert')

            background_tasks.add_task(
                pipeline_index_texts,
                rag,
                [request.text],
                file_sources=[request.file_source or 'unknown_source'],
                track_id=track_id,
            )

            return InsertResponse(
                status='success',
                message='Text successfully received. Processing will continue in background.',
                track_id=track_id,
            )
        except Exception as e:
            logger.error(f'Error /documents/text: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

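    # Illustrative request body for the route above (a sketch; host/port are
    # assumptions). file_source is optional and falls back to "unknown_source":
    #
    #   curl -X POST http://localhost:9621/documents/text \
    #        -H 'Content-Type: application/json' \
    #        -d '{"text": "LightRAG indexes documents into a knowledge graph.", "file_source": "notes.txt"}'
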
    @router.post(
        '/texts',
        response_model=InsertResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def insert_texts(request: InsertTextsRequest, background_tasks: BackgroundTasks):
        """
        Insert multiple texts into the RAG system.

        This endpoint allows you to insert multiple text entries into the RAG system
        in a single request.

        Note:
            If any text content or file_source already exists in the system,
            the entire batch will be rejected with status "duplicated".

        Args:
            request (InsertTextsRequest): The request body containing the list of texts.
            background_tasks: FastAPI BackgroundTasks for async processing

        Returns:
            InsertResponse: A response object containing the status of the operation.

        Raises:
            HTTPException: If an error occurs during text processing (500).
        """
        try:
            # Check if any file_sources already exist in doc_status storage
            if request.file_sources:
                for file_source in request.file_sources:
                    if file_source and file_source.strip() and file_source != 'unknown_source':
                        existing_doc_data = await rag.doc_status.get_doc_by_file_path(file_source)
                        if existing_doc_data:
                            # Get document status and track_id from the existing document
                            status = existing_doc_data.get('status', 'unknown')
                            # Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
                            existing_track_id = existing_doc_data.get('track_id') or ''
                            return InsertResponse(
                                status='duplicated',
                                message=f"File source '{file_source}' already exists in document storage (Status: {status}).",
                                track_id=existing_track_id,
                            )

            # Check if any content already exists by computing content hashes (doc_ids)
            for text in request.texts:
                sanitized_text = sanitize_text_for_encoding(text)
                content_doc_id = compute_mdhash_id(sanitized_text, prefix='doc-')
                existing_doc = await rag.doc_status.get_by_id(content_doc_id)
                if existing_doc:
                    # Content already exists, return duplicated with the existing track_id
                    status = existing_doc.get('status', 'unknown')
                    existing_track_id = existing_doc.get('track_id') or ''
                    return InsertResponse(
                        status='duplicated',
                        message=f'Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).',
                        track_id=existing_track_id,
                    )

            # Generate a track_id for the texts insertion
            track_id = generate_track_id('insert')

            background_tasks.add_task(
                pipeline_index_texts,
                rag,
                request.texts,
                file_sources=request.file_sources,
                track_id=track_id,
            )

            return InsertResponse(
                status='success',
                message='Texts successfully received. Processing will continue in background.',
                track_id=track_id,
            )
        except Exception as e:
            logger.error(f'Error /documents/texts: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

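    # Illustrative batch request for the route above (a sketch; host/port are
    # assumptions). texts and file_sources are matched by position:
    #
    #   curl -X POST http://localhost:9621/documents/texts \
    #        -H 'Content-Type: application/json' \
    #        -d '{"texts": ["first document", "second document"], "file_sources": ["a.txt", "b.txt"]}'
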
    @router.delete('', response_model=ClearDocumentsResponse, dependencies=[Depends(combined_auth)])
    async def clear_documents():
        """
        Clear all documents from the RAG system.

        This endpoint deletes all documents, entities, relationships, and files from the system.
        It uses the storage drop methods to properly clean up all data and removes all files
        from the input directory.

        Returns:
            ClearDocumentsResponse: A response object containing the status and message.
                - status="success": All documents and files were successfully cleared.
                - status="partial_success": The clearing job finished with some errors.
                - status="busy": Operation could not be completed because the pipeline is busy.
                - status="fail": All storage drop operations failed.
                - message: Detailed information about the operation results, including counts
                  of deleted files and any errors encountered.

        Raises:
            HTTPException: Raised when a serious error occurs during the clearing process,
                with status code 500 and error details in the detail field.
        """
        from lightrag.kg.shared_storage import (
            get_namespace_data,
            get_namespace_lock,
        )

        # Get pipeline status and lock
        pipeline_status = await get_namespace_data('pipeline_status', workspace=rag.workspace)
        pipeline_status_lock = get_namespace_lock('pipeline_status', workspace=rag.workspace)

        # Check and set status with lock
        async with pipeline_status_lock:
            if pipeline_status.get('busy', False):
                return ClearDocumentsResponse(
                    status='busy',
                    message='Cannot clear documents while pipeline is busy',
                )
            # Set busy to true
            pipeline_status.update(
                {
                    'busy': True,
                    'job_name': 'Clearing Documents',
                    'job_start': datetime.now().isoformat(),
                    'docs': 0,
                    'batchs': 0,
                    'cur_batch': 0,
                    'request_pending': False,  # Clear any previous request
                    'latest_message': 'Starting document clearing process',
                }
            )
            # Clean history_messages without breaking it as a shared list object
            del pipeline_status['history_messages'][:]
            pipeline_status['history_messages'].append('Starting document clearing process')

        try:
            # Use drop method to clear all data
            drop_tasks = []
            storages = [
                rag.text_chunks,
                rag.full_docs,
                rag.full_entities,
                rag.full_relations,
                rag.entity_chunks,
                rag.relation_chunks,
                rag.entities_vdb,
                rag.relationships_vdb,
                rag.chunks_vdb,
                rag.chunk_entity_relation_graph,
                rag.doc_status,
            ]

            # Log storage drop start
            if 'history_messages' in pipeline_status:
                pipeline_status['history_messages'].append('Starting to drop storage components')

            for storage in storages:
                if storage is not None:
                    drop_tasks.append(storage.drop())

            # Wait for all drop tasks to complete
            drop_results = await asyncio.gather(*drop_tasks, return_exceptions=True)

            # Check for errors and log results
            errors = []
            storage_success_count = 0
            storage_error_count = 0

            for i, result in enumerate(drop_results):
                storage_name = storages[i].__class__.__name__
                if isinstance(result, Exception):
                    error_msg = f'Error dropping {storage_name}: {result!s}'
                    errors.append(error_msg)
                    logger.error(error_msg)
                    storage_error_count += 1
                else:
                    namespace = storages[i].namespace
                    workspace = storages[i].workspace
                    logger.info(f'Successfully dropped {storage_name}: {workspace}/{namespace}')
                    storage_success_count += 1

            # Log storage drop results
            if 'history_messages' in pipeline_status:
                if storage_error_count > 0:
                    pipeline_status['history_messages'].append(
                        f'Dropped {storage_success_count} storage components with {storage_error_count} errors'
                    )
                else:
                    pipeline_status['history_messages'].append(
                        f'Successfully dropped all {storage_success_count} storage components'
                    )

            # If all storage operations failed, return error status and don't proceed with file deletion
            if storage_success_count == 0 and storage_error_count > 0:
                error_message = 'All storage drop operations failed. Aborting document clearing process.'
                logger.error(error_message)
                if 'history_messages' in pipeline_status:
                    pipeline_status['history_messages'].append(error_message)
                return ClearDocumentsResponse(status='fail', message=error_message)

            # Log file deletion start
            if 'history_messages' in pipeline_status:
                pipeline_status['history_messages'].append('Starting to delete files in input directory')

            # Delete only files in the current directory, preserve files in subdirectories
            deleted_files_count = 0
            file_errors_count = 0

            for file_path in doc_manager.input_dir.glob('*'):
                if file_path.is_file():
                    try:
                        file_path.unlink()
                        deleted_files_count += 1
                    except Exception as e:
                        logger.error(f'Error deleting file {file_path}: {e!s}')
                        file_errors_count += 1

            # Log file deletion results
            if 'history_messages' in pipeline_status:
                if file_errors_count > 0:
                    pipeline_status['history_messages'].append(
                        f'Deleted {deleted_files_count} files with {file_errors_count} errors'
                    )
                    errors.append(f'Failed to delete {file_errors_count} files')
                else:
                    pipeline_status['history_messages'].append(f'Successfully deleted {deleted_files_count} files')

            # Prepare final result message
            final_message = ''
            if errors:
                final_message = f'Cleared documents with some errors. Deleted {deleted_files_count} files.'
                status = 'partial_success'
            else:
                final_message = f'All documents cleared successfully. Deleted {deleted_files_count} files.'
                status = 'success'

            # Log final result
            if 'history_messages' in pipeline_status:
                pipeline_status['history_messages'].append(final_message)

            # Return response based on results
            return ClearDocumentsResponse(status=status, message=final_message)
        except Exception as e:
            error_msg = f'Error clearing documents: {e!s}'
            logger.error(error_msg)
            logger.error(traceback.format_exc())
            if 'history_messages' in pipeline_status:
                pipeline_status['history_messages'].append(error_msg)
            raise HTTPException(status_code=500, detail=str(e)) from e
        finally:
            # Reset busy status after completion
            async with pipeline_status_lock:
                pipeline_status['busy'] = False
                completion_msg = 'Document clearing process completed'
                pipeline_status['latest_message'] = completion_msg
                if 'history_messages' in pipeline_status:
                    pipeline_status['history_messages'].append(completion_msg)

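    # Illustrative call for the route above (a sketch; host/port are assumptions).
    # Note the empty path: a DELETE on the router prefix clears everything:
    #
    #   curl -X DELETE http://localhost:9621/documents
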
    @router.get(
        '/pipeline_status',
        dependencies=[Depends(combined_auth)],
        response_model=PipelineStatusResponse,
    )
    async def get_pipeline_status() -> PipelineStatusResponse:
        """
        Get the current status of the document indexing pipeline.

        This endpoint returns information about the current state of the document processing pipeline,
        including the processing status, progress information, and history messages.

        Returns:
            PipelineStatusResponse: A response object containing:
                - autoscanned (bool): Whether auto-scan has started
                - busy (bool): Whether the pipeline is currently busy
                - job_name (str): Current job name (e.g., indexing files/indexing texts)
                - job_start (str, optional): Job start time as ISO format string
                - docs (int): Total number of documents to be indexed
                - batchs (int): Number of batches for processing documents
                - cur_batch (int): Current processing batch
                - request_pending (bool): Flag for pending request for processing
                - latest_message (str): Latest message from pipeline processing
                - history_messages (List[str], optional): List of history messages (limited to the latest
                  1000 entries, with a truncation message if more than 1000 messages exist)

        Raises:
            HTTPException: If an error occurs while retrieving pipeline status (500)
        """
        try:
            from lightrag.kg.shared_storage import (
                get_all_update_flags_status,
                get_namespace_data,
                get_namespace_lock,
            )

            pipeline_status = await get_namespace_data('pipeline_status', workspace=rag.workspace)
            pipeline_status_lock = get_namespace_lock('pipeline_status', workspace=rag.workspace)

            # Get update flags status for all namespaces
            update_status = await get_all_update_flags_status(workspace=rag.workspace)

            # Convert MutableBoolean objects to regular boolean values
            processed_update_status = {}
            for namespace, flags in update_status.items():
                processed_flags = []
                for flag in flags:
                    # Handle both multiprocess and single process cases
                    if hasattr(flag, 'value'):
                        processed_flags.append(bool(flag.value))
                    else:
                        processed_flags.append(bool(flag))
                processed_update_status[namespace] = processed_flags

            async with pipeline_status_lock:
                # Convert to regular dict if it's a Manager.dict
                status_dict = dict(pipeline_status)

                # Add processed update_status to the status dictionary
                status_dict['update_status'] = processed_update_status

                # Convert history_messages to a regular list if it's a Manager.list
                # and limit to the latest 1000 entries with a truncation message if needed
                if 'history_messages' in status_dict:
                    history_list = list(status_dict['history_messages'])
                    total_count = len(history_list)

                    if total_count > 1000:
                        # Calculate truncated message count
                        truncated_count = total_count - 1000

                        # Take only the latest 1000 messages
                        latest_messages = history_list[-1000:]

                        # Add truncation message at the beginning
                        truncation_message = f'[Truncated history messages: {truncated_count}/{total_count}]'
                        status_dict['history_messages'] = [truncation_message, *latest_messages]
                    else:
                        # No truncation needed, return all messages
                        status_dict['history_messages'] = history_list

                # Ensure job_start is properly formatted as a string with timezone information
                if status_dict.get('job_start'):
                    # Use format_datetime to ensure consistent formatting
                    status_dict['job_start'] = format_datetime(status_dict['job_start'])

            return PipelineStatusResponse(**status_dict)
        except Exception as e:
            logger.error(f'Error getting pipeline status: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

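    # Illustrative status poll for the route above (a sketch; host/port are
    # assumptions). Useful for watching long-running scans or deletions:
    #
    #   curl http://localhost:9621/documents/pipeline_status
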
    # TODO: Deprecated, use /documents/paginated instead
    @router.get('', response_model=DocsStatusesResponse, dependencies=[Depends(combined_auth)])
    async def documents() -> DocsStatusesResponse:
        """
        Get the status of all documents in the system. This endpoint is deprecated; use /documents/paginated instead.
        To prevent excessive resource consumption, a maximum of 1,000 records is returned.

        This endpoint retrieves the current status of all documents, grouped by their
        processing status (PENDING, PROCESSING, PREPROCESSED, PROCESSED, FAILED). The results are
        limited to 1000 total documents with fair distribution across all statuses.

        Returns:
            DocsStatusesResponse: A response object containing a dictionary where keys are
                DocStatus values and values are lists of DocStatusResponse
                objects representing documents in each status category.
                A maximum of 1000 documents total will be returned.

        Raises:
            HTTPException: If an error occurs while retrieving document statuses (500).
        """
        try:
            statuses = (
                DocStatus.PENDING,
                DocStatus.PROCESSING,
                DocStatus.PREPROCESSED,
                DocStatus.PROCESSED,
                DocStatus.FAILED,
            )

            tasks = [rag.get_docs_by_status(status) for status in statuses]
            results: list[dict[str, DocProcessingStatus]] = await asyncio.gather(*tasks)

            response = DocsStatusesResponse()
            total_documents = 0
            max_documents = 1000

            # Convert results to lists for easier processing
            status_documents = []
            for idx, result in enumerate(results):
                status = statuses[idx]
                docs_list = []
                for doc_id, doc_status in result.items():
                    docs_list.append((doc_id, doc_status))
                status_documents.append((status, docs_list))

            # Fair distribution: round-robin across statuses
            status_indices = [0] * len(status_documents)  # Track current index for each status
            current_status_idx = 0

            while total_documents < max_documents:
                # Check if we have any documents left to process
                has_remaining = False
                for status_idx, (_status, docs_list) in enumerate(status_documents):
                    if status_indices[status_idx] < len(docs_list):
                        has_remaining = True
                        break

                if not has_remaining:
                    break

                # Try to get a document from the current status
                status, docs_list = status_documents[current_status_idx]
                current_index = status_indices[current_status_idx]

                if current_index < len(docs_list):
                    doc_id, doc_status = docs_list[current_index]

                    if status not in response.statuses:
                        response.statuses[status] = []

                    response.statuses[status].append(
                        DocStatusResponse(
                            id=doc_id,
                            content_summary=doc_status.content_summary,
                            content_length=doc_status.content_length,
                            status=doc_status.status,
                            created_at=format_datetime(doc_status.created_at),
                            updated_at=format_datetime(doc_status.updated_at),
                            track_id=doc_status.track_id,
                            chunks_count=doc_status.chunks_count,
                            error_msg=doc_status.error_msg,
                            metadata=doc_status.metadata,
                            file_path=doc_status.file_path,
                            s3_key=doc_status.s3_key,
                        )
                    )

                    status_indices[current_status_idx] += 1
                    total_documents += 1

                # Move to the next status (round-robin)
                current_status_idx = (current_status_idx + 1) % len(status_documents)

            return response
        except Exception as e:
            logger.error(f'Error GET /documents: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

    class DeleteDocByIdResponse(BaseModel):
        """Response model for the document deletion operation."""

        status: Literal['deletion_started', 'busy', 'not_allowed'] = Field(
            description='Status of the deletion operation'
        )
        message: str = Field(description='Message describing the operation result')
        doc_id: str = Field(description='The ID(s) of the document(s) to delete')

    @router.delete(
        '/delete_document',
        response_model=DeleteDocByIdResponse,
        dependencies=[Depends(combined_auth)],
        summary='Delete a document and all its associated data by its ID.',
    )
    async def delete_document(
        delete_request: DeleteDocRequest,
        background_tasks: BackgroundTasks,
    ) -> DeleteDocByIdResponse:
        """
        Delete documents and all their associated data by their IDs using background processing.

        Deletes specific documents and all their associated data, including their status,
        text chunks, vector embeddings, and any related graph data. When requested,
        cached LLM extraction responses are removed after graph deletion/rebuild completes.
        The deletion process runs in the background to avoid blocking the client connection.

        This operation is irreversible and will interact with the pipeline status.

        Args:
            delete_request (DeleteDocRequest): The request containing the document IDs and deletion options.
            background_tasks: FastAPI BackgroundTasks for async processing

        Returns:
            DeleteDocByIdResponse: The result of the deletion operation.
                - status="deletion_started": The document deletion has been initiated in the background.
                - status="busy": The pipeline is busy with another operation.

        Raises:
            HTTPException:
                - 500: If an unexpected internal error occurs during initialization.
        """
        doc_ids = delete_request.doc_ids

        try:
            from lightrag.kg.shared_storage import (
                get_namespace_data,
                get_namespace_lock,
            )

            pipeline_status = await get_namespace_data('pipeline_status', workspace=rag.workspace)
            pipeline_status_lock = get_namespace_lock('pipeline_status', workspace=rag.workspace)

            # Check if the pipeline is busy, holding the proper lock
            async with pipeline_status_lock:
                if pipeline_status.get('busy', False):
                    return DeleteDocByIdResponse(
                        status='busy',
                        message='Cannot delete documents while pipeline is busy',
                        doc_id=', '.join(doc_ids),
                    )

            # Add deletion task to background tasks
            background_tasks.add_task(
                background_delete_documents,
                rag,
                doc_manager,
                doc_ids,
                delete_request.delete_file,
                delete_request.delete_llm_cache,
            )

            return DeleteDocByIdResponse(
                status='deletion_started',
                message=f'Document deletion for {len(doc_ids)} documents has been initiated. Processing will continue in background.',
                doc_id=', '.join(doc_ids),
            )

        except Exception as e:
            error_msg = f'Error initiating document deletion for {delete_request.doc_ids}: {e!s}'
            logger.error(error_msg)
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=error_msg) from e

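    # Illustrative deletion request for the route above (a sketch; host/port and
    # the document ID are assumptions). Deletion runs in the background; poll
    # /documents/pipeline_status to follow progress:
    #
    #   curl -X DELETE http://localhost:9621/documents/delete_document \
    #        -H 'Content-Type: application/json' \
    #        -d '{"doc_ids": ["doc-abc123"], "delete_file": true, "delete_llm_cache": false}'
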
    @router.post(
        '/clear_cache',
        response_model=ClearCacheResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def clear_cache(request: ClearCacheRequest):
        """
        Clear all cache data from the LLM response cache storage.

        This endpoint clears all cached LLM responses regardless of mode.
        The request body is accepted for API compatibility but is ignored.

        Args:
            request (ClearCacheRequest): The request body (ignored, kept for compatibility).

        Returns:
            ClearCacheResponse: A response object containing the status and message.

        Raises:
            HTTPException: If an error occurs during cache clearing (500).
        """
        try:
            # Call the aclear_cache method (no modes parameter)
            await rag.aclear_cache()

            # Prepare success message
            message = 'Successfully cleared all cache'

            return ClearCacheResponse(status='success', message=message)
        except Exception as e:
            logger.error(f'Error clearing cache: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

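    # Illustrative call for the route above (a sketch; host/port are assumptions).
    # The body is accepted only for backward compatibility:
    #
    #   curl -X POST http://localhost:9621/documents/clear_cache \
    #        -H 'Content-Type: application/json' -d '{}'
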
    @router.delete(
        '/delete_entity',
        response_model=DeletionResult,
        dependencies=[Depends(combined_auth)],
    )
    async def delete_entity(request: DeleteEntityRequest):
        """
        Delete an entity and all its relationships from the knowledge graph.

        Args:
            request (DeleteEntityRequest): The request body containing the entity name.

        Returns:
            DeletionResult: An object containing the outcome of the deletion process.

        Raises:
            HTTPException: If the entity is not found (404) or an error occurs (500).
        """
        try:
            result = await rag.adelete_by_entity(entity_name=request.entity_name)
            if result.status == 'not_found':
                raise HTTPException(status_code=404, detail=result.message)
            if result.status == 'fail':
                raise HTTPException(status_code=500, detail=result.message)
            # Set doc_id to an empty string since this is an entity operation, not a document one
            result.doc_id = ''
            return result
        except HTTPException:
            raise
        except Exception as e:
            error_msg = f"Error deleting entity '{request.entity_name}': {e!s}"
            logger.error(error_msg)
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=error_msg) from e

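    # Illustrative request for the route above (a sketch; host/port and the entity
    # name are assumptions):
    #
    #   curl -X DELETE http://localhost:9621/documents/delete_entity \
    #        -H 'Content-Type: application/json' -d '{"entity_name": "Alan Turing"}'
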
    @router.delete(
        '/delete_relation',
        response_model=DeletionResult,
        dependencies=[Depends(combined_auth)],
    )
    async def delete_relation(request: DeleteRelationRequest):
        """
        Delete a relationship between two entities from the knowledge graph.

        Args:
            request (DeleteRelationRequest): The request body containing the source and target entity names.

        Returns:
            DeletionResult: An object containing the outcome of the deletion process.

        Raises:
            HTTPException: If the relation is not found (404) or an error occurs (500).
        """
        try:
            result = await rag.adelete_by_relation(
                source_entity=request.source_entity,
                target_entity=request.target_entity,
            )
            if result.status == 'not_found':
                raise HTTPException(status_code=404, detail=result.message)
            if result.status == 'fail':
                raise HTTPException(status_code=500, detail=result.message)
            # Set doc_id to an empty string since this is a relation operation, not a document one
            result.doc_id = ''
            return result
        except HTTPException:
            raise
        except Exception as e:
            error_msg = f"Error deleting relation from '{request.source_entity}' to '{request.target_entity}': {e!s}"
            logger.error(error_msg)
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=error_msg) from e

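    # Illustrative request for the route above (a sketch; host/port and the entity
    # names are assumptions):
    #
    #   curl -X DELETE http://localhost:9621/documents/delete_relation \
    #        -H 'Content-Type: application/json' \
    #        -d '{"source_entity": "Alan Turing", "target_entity": "Enigma"}'
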
    @router.get(
        '/track_status/{track_id}',
        response_model=TrackStatusResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def get_track_status(track_id: str) -> TrackStatusResponse:
        """
        Get the processing status of documents by tracking ID.

        This endpoint retrieves all documents associated with a specific tracking ID,
        allowing users to monitor the processing progress of their uploaded files or inserted texts.

        Args:
            track_id (str): The tracking ID returned from upload, text, or texts endpoints

        Returns:
            TrackStatusResponse: A response object containing:
                - track_id: The tracking ID
                - documents: List of documents associated with this track_id
                - total_count: Total number of documents for this track_id

        Raises:
            HTTPException: If track_id is invalid (400) or an error occurs (500).
        """
        try:
            # Validate track_id
            if not track_id or not track_id.strip():
                raise HTTPException(status_code=400, detail='Track ID cannot be empty')

            track_id = track_id.strip()

            # Get documents by track_id
            docs_by_track_id = await rag.aget_docs_by_track_id(track_id)

            # Convert to response format
            documents = []
            status_summary = {}

            for doc_id, doc_status in docs_by_track_id.items():
                documents.append(
                    DocStatusResponse(
                        id=doc_id,
                        content_summary=doc_status.content_summary,
                        content_length=doc_status.content_length,
                        status=doc_status.status,
                        created_at=format_datetime(doc_status.created_at),
                        updated_at=format_datetime(doc_status.updated_at),
                        track_id=doc_status.track_id,
                        chunks_count=doc_status.chunks_count,
                        error_msg=doc_status.error_msg,
                        metadata=doc_status.metadata,
                        file_path=doc_status.file_path,
                        s3_key=doc_status.s3_key,
                    )
                )

                # Build the status summary
                # Handle both DocStatus enum and string cases for robust deserialization
                status_key = str(doc_status.status)
                status_summary[status_key] = status_summary.get(status_key, 0) + 1

            return TrackStatusResponse(
                track_id=track_id,
                documents=documents,
                total_count=len(documents),
                status_summary=status_summary,
            )

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f'Error getting track status for {track_id}: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

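    # Illustrative poll for the route above (a sketch; host/port are assumptions,
    # and <track_id> is a placeholder for an ID returned by /upload, /text, /texts,
    # or /scan):
    #
    #   curl http://localhost:9621/documents/track_status/<track_id>
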
    @router.post(
        '/paginated',
        response_model=PaginatedDocsResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def get_documents_paginated(
        request: DocumentsRequest,
    ) -> PaginatedDocsResponse:
        """
        Get documents with pagination support.

        This endpoint retrieves documents with pagination, filtering, and sorting capabilities.
        It provides better performance for large document collections by loading only the
        requested page of data.

        Args:
            request (DocumentsRequest): The request body containing pagination parameters

        Returns:
            PaginatedDocsResponse: A response object containing:
                - documents: List of documents for the current page
                - pagination: Pagination information (page, total_count, etc.)
                - status_counts: Count of documents by status for all documents

        Raises:
            HTTPException: If an error occurs while retrieving documents (500).
        """
        try:
            # Get paginated documents and status counts in parallel
            docs_task = rag.doc_status.get_docs_paginated(
                status_filter=request.status_filter,
                page=request.page,
                page_size=request.page_size,
                sort_field=request.sort_field,
                sort_direction=request.sort_direction,
            )
            status_counts_task = rag.doc_status.get_all_status_counts()

            # Execute both queries in parallel
            (documents_with_ids, total_count), status_counts = await asyncio.gather(docs_task, status_counts_task)

            # Convert documents to response format
            doc_responses = []
            for doc_id, doc in documents_with_ids:
                doc_responses.append(
                    DocStatusResponse(
                        id=doc_id,
                        content_summary=doc.content_summary,
                        content_length=doc.content_length,
                        status=doc.status,
                        created_at=format_datetime(doc.created_at),
                        updated_at=format_datetime(doc.updated_at),
                        track_id=doc.track_id,
                        chunks_count=doc.chunks_count,
                        error_msg=doc.error_msg,
                        metadata=doc.metadata,
                        file_path=doc.file_path,
                        s3_key=doc.s3_key,
                    )
                )

            # Calculate pagination info
            total_pages = (total_count + request.page_size - 1) // request.page_size
            has_next = request.page < total_pages
            has_prev = request.page > 1

            pagination = PaginationInfo(
                page=request.page,
                page_size=request.page_size,
                total_count=total_count,
                total_pages=total_pages,
                has_next=has_next,
                has_prev=has_prev,
            )

            return PaginatedDocsResponse(
                documents=doc_responses,
                pagination=pagination,
                status_counts=status_counts,
            )

        except Exception as e:
            logger.error(f'Error getting paginated documents: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

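    # Illustrative request for the route above (a sketch; host/port are assumptions,
    # and the sort field/direction values shown assume DocumentsRequest accepts them):
    #
    #   curl -X POST http://localhost:9621/documents/paginated \
    #        -H 'Content-Type: application/json' \
    #        -d '{"page": 1, "page_size": 50, "sort_field": "updated_at", "sort_direction": "desc"}'
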
    @router.get(
        '/status_counts',
        response_model=StatusCountsResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def get_document_status_counts() -> StatusCountsResponse:
        """
        Get counts of documents by status.

        This endpoint retrieves the count of documents in each processing status
        (PENDING, PROCESSING, PROCESSED, FAILED) for all documents in the system.

        Returns:
            StatusCountsResponse: A response object containing status counts

        Raises:
            HTTPException: If an error occurs while retrieving status counts (500).
        """
        try:
            status_counts = await rag.doc_status.get_all_status_counts()
            return StatusCountsResponse(status_counts=status_counts)

        except Exception as e:
            logger.error(f'Error getting document status counts: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

    @router.post(
        '/reprocess_failed',
        response_model=ReprocessResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def reprocess_failed_documents(background_tasks: BackgroundTasks):
        """
        Reprocess failed and pending documents.

        This endpoint triggers the document processing pipeline, which automatically
        picks up and reprocesses documents in the following statuses:
        - FAILED: Documents that failed during previous processing attempts
        - PENDING: Documents waiting to be processed
        - PROCESSING: Documents whose processing terminated abnormally (e.g., server crashes)

        This is useful for recovering from server crashes, network errors, LLM service
        outages, or other temporary failures that caused document processing to fail.

        The processing happens in the background and can be monitored by checking the
        pipeline status. Reprocessed documents retain their original track_id from the
        initial upload, so use that original track_id to monitor progress.

        Returns:
            ReprocessResponse: Response with status and message.
                track_id is always an empty string because reprocessed documents retain
                their original track_id from the initial upload.

        Raises:
            HTTPException: If an error occurs while initiating reprocessing (500).
        """
        try:
            # Start the reprocessing in the background
            # Note: Reprocessed documents retain their original track_id from the initial upload
            background_tasks.add_task(rag.apipeline_process_enqueue_documents)
            logger.info('Reprocessing of failed documents initiated')

            return ReprocessResponse(
                status='reprocessing_started',
                message='Reprocessing of failed documents has been initiated in background. Documents retain their original track_id.',
            )

        except Exception as e:
            logger.error(f'Error initiating reprocessing of failed documents: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

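    # Illustrative trigger for the route above (a sketch; host/port are assumptions).
    # No body is required; affected documents keep their original track_id:
    #
    #   curl -X POST http://localhost:9621/documents/reprocess_failed
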
    @router.post(
        '/cancel_pipeline',
        response_model=CancelPipelineResponse,
        dependencies=[Depends(combined_auth)],
    )
    async def cancel_pipeline():
        """
        Request cancellation of the currently running pipeline.

        This endpoint sets a cancellation flag in the pipeline status. The pipeline will:
        1. Check this flag at key processing points
        2. Stop processing new documents
        3. Cancel all running document processing tasks
        4. Mark all PROCESSING documents as FAILED with reason "User cancelled"

        The cancellation is graceful and ensures data consistency. Documents that have
        completed processing will remain in PROCESSED status.

        Returns:
            CancelPipelineResponse: Response with status and message
                - status="cancellation_requested": Cancellation flag has been set
                - status="not_busy": Pipeline is not currently running

        Raises:
            HTTPException: If an error occurs while setting the cancellation flag (500).
        """
        try:
            from lightrag.kg.shared_storage import (
                get_namespace_data,
                get_namespace_lock,
            )

            pipeline_status = await get_namespace_data('pipeline_status', workspace=rag.workspace)
            pipeline_status_lock = get_namespace_lock('pipeline_status', workspace=rag.workspace)

            async with pipeline_status_lock:
                if not pipeline_status.get('busy', False):
                    return CancelPipelineResponse(
                        status='not_busy',
                        message='Pipeline is not currently running. No cancellation needed.',
                    )

                # Set the cancellation flag
                pipeline_status['cancellation_requested'] = True
                cancel_msg = 'Pipeline cancellation requested by user'
                logger.info(cancel_msg)
                pipeline_status['latest_message'] = cancel_msg
                pipeline_status['history_messages'].append(cancel_msg)

            return CancelPipelineResponse(
                status='cancellation_requested',
                message='Pipeline cancellation has been requested. Documents will be marked as FAILED.',
            )

        except Exception as e:
            logger.error(f'Error requesting pipeline cancellation: {e!s}')
            logger.error(traceback.format_exc())
            raise HTTPException(status_code=500, detail=str(e)) from e

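    # Illustrative cancellation for the route above (a sketch; host/port are
    # assumptions). Cancellation is a flag, not a hard kill; the pipeline stops at
    # its next checkpoint:
    #
    #   curl -X POST http://localhost:9621/documents/cancel_pipeline
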
    return router