fix: subdirectories are no longer processed during file scans

• Change rglob to glob for file scanning
• Simplify error logging messages
This commit is contained in:
yangdx 2025-08-16 23:46:33 +08:00
parent f5b0c3d38c
commit cceb46b320
2 changed files with 10 additions and 14 deletions

View file

@@ -734,7 +734,7 @@ class DocumentManager:
new_files = []
for ext in self.supported_extensions:
logger.debug(f"Scanning for {ext} files in {self.input_dir}")
for file_path in self.input_dir.rglob(f"*{ext}"):
for file_path in self.input_dir.glob(f"*{ext}"):
if file_path not in self.indexed_files:
new_files.append(file_path)
return new_files
@@ -1122,12 +1122,14 @@ async def pipeline_enqueue_file(
{
"file_path": str(file_path.name),
"error_description": "File format processing error",
"original_error": f"Unexpected error during file processing: {str(e)}",
"original_error": f"Unexpected error during file extracting: {str(e)}",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(error_files, track_id)
logger.error(f"Unexpected error processing file {file_path.name}: {str(e)}")
logger.error(
f"Unexpected error during {file_path.name} extracting: {str(e)}"
)
return False, track_id
# Insert into the RAG queue
@@ -1144,7 +1146,7 @@ async def pipeline_enqueue_file(
]
await rag.apipeline_enqueue_error_documents(error_files, track_id)
logger.warning(
f"File contains only whitespace characters. file_paths={file_path.name}"
f"File contains only whitespace characters: {file_path.name}"
)
return False, track_id
@@ -1168,7 +1170,7 @@ async def pipeline_enqueue_file(
# Move the file
file_path.rename(target_path)
logger.info(
logger.debug(
f"Moved file to enqueued directory: {file_path.name} -> {unique_filename}"
)
@@ -1202,7 +1204,7 @@ async def pipeline_enqueue_file(
}
]
await rag.apipeline_enqueue_error_documents(error_files, track_id)
logger.error(f"No content could be extracted from file: {file_path.name}")
logger.error(f"No content extracted from file: {file_path.name}")
return False, track_id
except Exception as e:
@@ -1221,7 +1223,7 @@ async def pipeline_enqueue_file(
}
]
await rag.apipeline_enqueue_error_documents(error_files, track_id)
logger.error(f"Error processing or enqueueing file {file_path.name}: {str(e)}")
logger.error(f"Enqueuing file {file_path.name} error: {str(e)}")
logger.error(traceback.format_exc())
return False, track_id
finally:

View file

@@ -1173,16 +1173,10 @@ class LightRAG:
# Store error documents in doc_status
if error_docs:
await self.doc_status.upsert(error_docs)
logger.info(
f"Recorded {len(error_docs)} file extraction errors in doc_status"
)
# Log each error for debugging
for doc_id, error_doc in error_docs.items():
logger.error(
f"File extraction error recorded - ID: {doc_id}, "
f"File: {error_doc['file_path']}, "
f"Error: {error_doc['content_summary']}"
f"File processing error: - ID: {doc_id} {error_doc['file_path']}"
)
async def _validate_and_fix_document_consistency(