From 9ed5b93467ba19ae77e33e0ca43648a480a31b70 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 19 Aug 2025 11:33:28 +0800 Subject: [PATCH] Add [File Extraction] prefix to error messages and logs --- lightrag/api/routers/document_routes.py | 60 ++++++++++++++----------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index fe2bd9a5..c9dc2617 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -814,37 +814,41 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "Permission denied - cannot read file", + "error_description": "[File Extraction]Permission denied - cannot read file", "original_error": str(e), "file_size": file_size, } ] await rag.apipeline_enqueue_error_documents(error_files, track_id) - logger.error(f"Permission denied reading file: {file_path.name}") + logger.error( + f"[File Extraction]Permission denied reading file: {file_path.name}" + ) return False, track_id except FileNotFoundError as e: error_files = [ { "file_path": str(file_path.name), - "error_description": "File not found", + "error_description": "[File Extraction]File not found", "original_error": str(e), "file_size": file_size, } ] await rag.apipeline_enqueue_error_documents(error_files, track_id) - logger.error(f"File not found: {file_path.name}") + logger.error(f"[File Extraction]File not found: {file_path.name}") return False, track_id except Exception as e: error_files = [ { "file_path": str(file_path.name), - "error_description": "File reading error", + "error_description": "[File Extraction]File reading error", "original_error": str(e), "file_size": file_size, } ] await rag.apipeline_enqueue_error_documents(error_files, track_id) - logger.error(f"Error reading file {file_path.name}: {str(e)}") + logger.error( + f"[File Extraction]Error reading file {file_path.name}: {str(e)}" + ) return False, track_id # Process based on file type @@ -894,7 +898,7 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "Empty file content", + "error_description": "[File Extraction]Empty file content", "original_error": "File contains no content or only whitespace", "file_size": file_size, } @@ -902,7 +906,9 @@ async def pipeline_enqueue_file( await rag.apipeline_enqueue_error_documents( error_files, track_id ) - logger.error(f"Empty content in file: {file_path.name}") + logger.error( + f"[File Extraction]Empty content in file: {file_path.name}" + ) return False, track_id # Check if content looks like binary data string representation @@ -910,7 +916,7 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "Binary data in text file", + "error_description": "[File Extraction]Binary data in text file", "original_error": "File appears to contain binary data representation instead of text", "file_size": file_size, } @@ -919,7 +925,7 @@ async def pipeline_enqueue_file( error_files, track_id ) logger.error( - f"File {file_path.name} appears to contain binary data representation instead of text" + f"[File Extraction]File {file_path.name} appears to contain binary data representation instead of text" ) return False, track_id @@ -927,7 +933,7 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "UTF-8 encoding error", + "error_description": "[File Extraction]UTF-8 encoding error, please convert it to UTF-8 before processing", "original_error": f"File is not valid UTF-8 encoded text: {str(e)}", "file_size": file_size, } @@ -936,7 +942,7 @@ async def pipeline_enqueue_file( error_files, track_id ) logger.error( - f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing." + f"[File Extraction]File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing." ) return False, track_id @@ -964,7 +970,7 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "PDF processing error", + "error_description": "[File Extraction]PDF processing error", "original_error": f"Failed to extract text from PDF: {str(e)}", "file_size": file_size, } @@ -972,7 +978,9 @@ async def pipeline_enqueue_file( await rag.apipeline_enqueue_error_documents( error_files, track_id ) - logger.error(f"Error processing PDF {file_path.name}: {str(e)}") + logger.error( + f"[File Extraction]Error processing PDF {file_path.name}: {str(e)}" + ) return False, track_id case ".docx": @@ -1003,7 +1011,7 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "DOCX processing error", + "error_description": "[File Extraction]DOCX processing error", "original_error": f"Failed to extract text from DOCX: {str(e)}", "file_size": file_size, } @@ -1012,7 +1020,7 @@ async def pipeline_enqueue_file( error_files, track_id ) logger.error( - f"Error processing DOCX {file_path.name}: {str(e)}" + f"[File Extraction]Error processing DOCX {file_path.name}: {str(e)}" ) return False, track_id @@ -1042,7 +1050,7 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "PPTX processing error", + "error_description": "[File Extraction]PPTX processing error", "original_error": f"Failed to extract text from PPTX: {str(e)}", "file_size": file_size, } @@ -1051,7 +1059,7 @@ async def pipeline_enqueue_file( error_files, track_id ) logger.error( - f"Error processing PPTX {file_path.name}: {str(e)}" + f"[File Extraction]Error processing PPTX {file_path.name}: {str(e)}" ) return False, track_id @@ -1088,7 +1096,7 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "XLSX processing error", + "error_description": "[File Extraction]XLSX processing error", "original_error": f"Failed to extract text from XLSX: {str(e)}", "file_size": file_size, } @@ -1097,7 +1105,7 @@ async def pipeline_enqueue_file( error_files, track_id ) logger.error( - f"Error processing XLSX {file_path.name}: {str(e)}" + f"[File Extraction]Error processing XLSX {file_path.name}: {str(e)}" ) return False, track_id @@ -1105,14 +1113,14 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": f"Unsupported file type: {ext}", + "error_description": f"[File Extraction]Unsupported file type: {ext}", "original_error": f"File extension {ext} is not supported", "file_size": file_size, } ] await rag.apipeline_enqueue_error_documents(error_files, track_id) logger.error( - f"Unsupported file type: {file_path.name} (extension {ext})" + f"[File Extraction]Unsupported file type: {file_path.name} (extension {ext})" ) return False, track_id @@ -1120,14 +1128,14 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "File format processing error", + "error_description": "[File Extraction]File format processing error", "original_error": f"Unexpected error during file extracting: {str(e)}", "file_size": file_size, } ] await rag.apipeline_enqueue_error_documents(error_files, track_id) logger.error( - f"Unexpected error during {file_path.name} extracting: {str(e)}" + f"[File Extraction]Unexpected error during {file_path.name} extracting: {str(e)}" ) return False, track_id @@ -1138,14 +1146,14 @@ async def pipeline_enqueue_file( error_files = [ { "file_path": str(file_path.name), - "error_description": "File contains only whitespace", + "error_description": "[File Extraction]File contains only whitespace", "original_error": "File content contains only whitespace characters", "file_size": file_size, } ] await rag.apipeline_enqueue_error_documents(error_files, track_id) logger.warning( - f"File contains only whitespace characters: {file_path.name}" + f"[File Extraction]File contains only whitespace characters: {file_path.name}" ) return False, track_id