Change the API for deleting documents to support deleting multiple documents at once.
This commit is contained in:
parent
495d6c8cce
commit
51bb0471cd
2 changed files with 97 additions and 47 deletions
|
|
@ -260,14 +260,25 @@ Attributes:
|
||||||
|
|
||||||
|
|
||||||
class DeleteDocRequest(BaseModel):
|
class DeleteDocRequest(BaseModel):
|
||||||
doc_id: str = Field(..., description="The ID of the document to delete.")
|
doc_ids: List[str] = Field(..., description="The IDs of the documents to delete.")
|
||||||
|
|
||||||
@field_validator("doc_id", mode="after")
|
@field_validator("doc_ids", mode="after")
|
||||||
@classmethod
|
@classmethod
|
||||||
def validate_doc_id(cls, doc_id: str) -> str:
|
def validate_doc_ids(cls, doc_ids: List[str]) -> List[str]:
|
||||||
if not doc_id or not doc_id.strip():
|
if not doc_ids:
|
||||||
raise ValueError("Document ID cannot be empty")
|
raise ValueError("Document IDs list cannot be empty")
|
||||||
return doc_id.strip()
|
|
||||||
|
validated_ids = []
|
||||||
|
for doc_id in doc_ids:
|
||||||
|
if not doc_id or not doc_id.strip():
|
||||||
|
raise ValueError("Document ID cannot be empty")
|
||||||
|
validated_ids.append(doc_id.strip())
|
||||||
|
|
||||||
|
# Check for duplicates
|
||||||
|
if len(validated_ids) != len(set(validated_ids)):
|
||||||
|
raise ValueError("Document IDs must be unique")
|
||||||
|
|
||||||
|
return validated_ids
|
||||||
|
|
||||||
|
|
||||||
class DeleteEntityRequest(BaseModel):
|
class DeleteEntityRequest(BaseModel):
|
||||||
|
|
@ -782,8 +793,8 @@ async def run_scanning_process(rag: LightRAG, doc_manager: DocumentManager):
|
||||||
logger.error(traceback.format_exc())
|
logger.error(traceback.format_exc())
|
||||||
|
|
||||||
|
|
||||||
async def background_delete_document(rag: LightRAG, doc_id: str):
|
async def background_delete_documents(rag: LightRAG, doc_ids: List[str]):
|
||||||
"""Background task to delete a document"""
|
"""Background task to delete multiple documents"""
|
||||||
from lightrag.kg.shared_storage import (
|
from lightrag.kg.shared_storage import (
|
||||||
get_namespace_data,
|
get_namespace_data,
|
||||||
get_pipeline_status_lock,
|
get_pipeline_status_lock,
|
||||||
|
|
@ -792,13 +803,20 @@ async def background_delete_document(rag: LightRAG, doc_id: str):
|
||||||
pipeline_status = await get_namespace_data("pipeline_status")
|
pipeline_status = await get_namespace_data("pipeline_status")
|
||||||
pipeline_status_lock = get_pipeline_status_lock()
|
pipeline_status_lock = get_pipeline_status_lock()
|
||||||
|
|
||||||
|
total_docs = len(doc_ids)
|
||||||
|
successful_deletions = []
|
||||||
|
failed_deletions = []
|
||||||
|
|
||||||
# Set pipeline status to busy for deletion
|
# Set pipeline status to busy for deletion
|
||||||
async with pipeline_status_lock:
|
async with pipeline_status_lock:
|
||||||
pipeline_status.update(
|
pipeline_status.update(
|
||||||
{
|
{
|
||||||
"busy": True,
|
"busy": True,
|
||||||
"job_name": f"Deleting Document: {doc_id}",
|
"job_name": f"Deleting {total_docs} Documents",
|
||||||
"job_start": datetime.now().isoformat(),
|
"job_start": datetime.now().isoformat(),
|
||||||
|
"docs": total_docs,
|
||||||
|
"batchs": total_docs,
|
||||||
|
"cur_batch": 0,
|
||||||
"latest_message": "Starting document deletion process",
|
"latest_message": "Starting document deletion process",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
@ -806,40 +824,81 @@ async def background_delete_document(rag: LightRAG, doc_id: str):
|
||||||
pipeline_status["history_messages"][:] = ["Starting document deletion process"]
|
pipeline_status["history_messages"][:] = ["Starting document deletion process"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = await rag.adelete_by_doc_id(doc_id)
|
# Loop through each document ID and delete them one by one
|
||||||
if "history_messages" in pipeline_status:
|
for i, doc_id in enumerate(doc_ids, 1):
|
||||||
pipeline_status["history_messages"].append(result.message)
|
async with pipeline_status_lock:
|
||||||
|
pipeline_status["cur_batch"] = i
|
||||||
|
pipeline_status["latest_message"] = (
|
||||||
|
f"Deleting document {i}/{total_docs}: {doc_id}"
|
||||||
|
)
|
||||||
|
pipeline_status["history_messages"].append(
|
||||||
|
f"Processing document {i}/{total_docs}: {doc_id}"
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(f"Document deletion completed for {doc_id}: {result.status}")
|
try:
|
||||||
|
result = await rag.adelete_by_doc_id(doc_id)
|
||||||
|
|
||||||
|
if result.status == "success":
|
||||||
|
successful_deletions.append(doc_id)
|
||||||
|
success_msg = (
|
||||||
|
f"Successfully deleted document {i}/{total_docs}: {doc_id}"
|
||||||
|
)
|
||||||
|
logger.info(success_msg)
|
||||||
|
|
||||||
|
async with pipeline_status_lock:
|
||||||
|
pipeline_status["history_messages"].append(success_msg)
|
||||||
|
else:
|
||||||
|
failed_deletions.append(doc_id)
|
||||||
|
error_msg = f"Failed to delete document {i}/{total_docs}: {doc_id} - {result.message}"
|
||||||
|
logger.error(error_msg)
|
||||||
|
|
||||||
|
async with pipeline_status_lock:
|
||||||
|
pipeline_status["history_messages"].append(error_msg)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
failed_deletions.append(doc_id)
|
||||||
|
error_msg = (
|
||||||
|
f"Error deleting document {i}/{total_docs}: {doc_id} - {str(e)}"
|
||||||
|
)
|
||||||
|
logger.error(error_msg)
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
|
||||||
|
async with pipeline_status_lock:
|
||||||
|
pipeline_status["history_messages"].append(error_msg)
|
||||||
|
|
||||||
|
# Final summary
|
||||||
|
summary_msg = f"Deletion completed: {len(successful_deletions)} successful, {len(failed_deletions)} failed"
|
||||||
|
logger.info(summary_msg)
|
||||||
|
|
||||||
|
async with pipeline_status_lock:
|
||||||
|
pipeline_status["history_messages"].append(summary_msg)
|
||||||
|
if successful_deletions:
|
||||||
|
pipeline_status["history_messages"].append(
|
||||||
|
f"Successfully deleted: {', '.join(successful_deletions)}"
|
||||||
|
)
|
||||||
|
if failed_deletions:
|
||||||
|
pipeline_status["history_messages"].append(
|
||||||
|
f"Failed to delete: {', '.join(failed_deletions)}"
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Error deleting document {doc_id}: {str(e)}"
|
error_msg = f"Critical error during batch deletion: {str(e)}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
logger.error(traceback.format_exc())
|
logger.error(traceback.format_exc())
|
||||||
if "history_messages" in pipeline_status:
|
|
||||||
|
async with pipeline_status_lock:
|
||||||
pipeline_status["history_messages"].append(error_msg)
|
pipeline_status["history_messages"].append(error_msg)
|
||||||
finally:
|
finally:
|
||||||
async with pipeline_status_lock:
|
async with pipeline_status_lock:
|
||||||
pipeline_status["busy"] = False
|
pipeline_status["busy"] = False
|
||||||
completion_msg = "Document deletion process completed."
|
completion_msg = "Document deletion process completed."
|
||||||
pipeline_status["latest_message"] = completion_msg
|
pipeline_status["latest_message"] = completion_msg
|
||||||
if "history_messages" in pipeline_status:
|
pipeline_status["history_messages"].append(completion_msg)
|
||||||
pipeline_status["history_messages"].append(completion_msg)
|
|
||||||
|
|
||||||
|
|
||||||
def create_document_routes(
|
def create_document_routes(
|
||||||
rag: LightRAG, doc_manager: DocumentManager, api_key: Optional[str] = None
|
rag: LightRAG, doc_manager: DocumentManager, api_key: Optional[str] = None
|
||||||
):
|
):
|
||||||
# Check if doc_id exists in the system - add this check at the beginning
|
|
||||||
async def check_doc_id_exists(doc_id: str) -> bool:
|
|
||||||
"""Check if document ID exists in the system"""
|
|
||||||
try:
|
|
||||||
doc_status = await rag.doc_status.get_by_id(doc_id)
|
|
||||||
return doc_status is not None
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error checking doc_id existence: {str(e)}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Create combined auth dependency for document routes
|
# Create combined auth dependency for document routes
|
||||||
combined_auth = get_combined_auth_dependency(api_key)
|
combined_auth = get_combined_auth_dependency(api_key)
|
||||||
|
|
||||||
|
|
@ -1326,9 +1385,9 @@ def create_document_routes(
|
||||||
background_tasks: BackgroundTasks,
|
background_tasks: BackgroundTasks,
|
||||||
) -> DeleteDocByIdResponse:
|
) -> DeleteDocByIdResponse:
|
||||||
"""
|
"""
|
||||||
Delete a document and all its associated data by its ID using background processing.
|
Delete documents and all their associated data by their IDs using background processing.
|
||||||
|
|
||||||
Deletes a specific document and all its associated data, including its status,
|
Deletes specific documents and all their associated data, including their status,
|
||||||
text chunks, vector embeddings, and any related graph data.
|
text chunks, vector embeddings, and any related graph data.
|
||||||
The deletion process runs in the background to avoid blocking the client connection.
|
The deletion process runs in the background to avoid blocking the client connection.
|
||||||
It is disabled when llm cache for entity extraction is disabled.
|
It is disabled when llm cache for entity extraction is disabled.
|
||||||
|
|
@ -1336,7 +1395,7 @@ def create_document_routes(
|
||||||
This operation is irreversible and will interact with the pipeline status.
|
This operation is irreversible and will interact with the pipeline status.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
delete_request (DeleteDocRequest): The request containing the document ID.
|
delete_request (DeleteDocRequest): The request containing the document IDs.
|
||||||
background_tasks: FastAPI BackgroundTasks for async processing
|
background_tasks: FastAPI BackgroundTasks for async processing
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -1349,10 +1408,7 @@ def create_document_routes(
|
||||||
HTTPException:
|
HTTPException:
|
||||||
- 500: If an unexpected internal error occurs during initialization.
|
- 500: If an unexpected internal error occurs during initialization.
|
||||||
"""
|
"""
|
||||||
# Check if doc_id exists first - return error immediately if not found
|
doc_ids = delete_request.doc_ids
|
||||||
doc_id = delete_request.doc_id
|
|
||||||
if not await check_doc_id_exists(doc_id):
|
|
||||||
raise HTTPException(status_code=404, detail=f"Document {doc_id} not found.")
|
|
||||||
|
|
||||||
# The rag object is initialized from the server startup args,
|
# The rag object is initialized from the server startup args,
|
||||||
# so we can access its properties here.
|
# so we can access its properties here.
|
||||||
|
|
@ -1360,7 +1416,7 @@ def create_document_routes(
|
||||||
return DeleteDocByIdResponse(
|
return DeleteDocByIdResponse(
|
||||||
status="not_allowed",
|
status="not_allowed",
|
||||||
message="Operation not allowed when LLM cache for entity extraction is disabled.",
|
message="Operation not allowed when LLM cache for entity extraction is disabled.",
|
||||||
doc_id=delete_request.doc_id,
|
doc_id=", ".join(delete_request.doc_ids),
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -1372,21 +1428,21 @@ def create_document_routes(
|
||||||
if pipeline_status.get("busy", False):
|
if pipeline_status.get("busy", False):
|
||||||
return DeleteDocByIdResponse(
|
return DeleteDocByIdResponse(
|
||||||
status="busy",
|
status="busy",
|
||||||
message="Cannot delete document while pipeline is busy",
|
message="Cannot delete documents while pipeline is busy",
|
||||||
doc_id=doc_id,
|
doc_id=", ".join(doc_ids),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add deletion task to background tasks
|
# Add deletion task to background tasks
|
||||||
background_tasks.add_task(background_delete_document, rag, doc_id)
|
background_tasks.add_task(background_delete_documents, rag, doc_ids)
|
||||||
|
|
||||||
return DeleteDocByIdResponse(
|
return DeleteDocByIdResponse(
|
||||||
status="deletion_started",
|
status="deletion_started",
|
||||||
message=f"Document deletion for '{doc_id}' has been initiated. Processing will continue in background.",
|
message=f"Document deletion for {len(doc_ids)} documents has been initiated. Processing will continue in background.",
|
||||||
doc_id=doc_id,
|
doc_id=", ".join(doc_ids),
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Error initiating document deletion for {delete_request.doc_id}: {str(e)}"
|
error_msg = f"Error initiating document deletion for {delete_request.doc_ids}: {str(e)}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
logger.error(traceback.format_exc())
|
logger.error(traceback.format_exc())
|
||||||
raise HTTPException(status_code=500, detail=error_msg)
|
raise HTTPException(status_code=500, detail=error_msg)
|
||||||
|
|
|
||||||
|
|
@ -1961,12 +1961,6 @@ class LightRAG:
|
||||||
logger.error(f"Failed to delete document and status: {e}")
|
logger.error(f"Failed to delete document and status: {e}")
|
||||||
raise Exception(f"Failed to delete document and status: {e}") from e
|
raise Exception(f"Failed to delete document and status: {e}") from e
|
||||||
|
|
||||||
async with pipeline_status_lock:
|
|
||||||
log_message = f"Successfully deleted document {doc_id}"
|
|
||||||
logger.info(log_message)
|
|
||||||
pipeline_status["latest_message"] = log_message
|
|
||||||
pipeline_status["history_messages"].append(log_message)
|
|
||||||
|
|
||||||
return DeletionResult(
|
return DeletionResult(
|
||||||
status="success",
|
status="success",
|
||||||
doc_id=doc_id,
|
doc_id=doc_id,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue