fix: subdirectories are no longer processed during file scans
• Change rglob to glob for file scanning
• Simplify error logging messages
parent f5b0c3d38c
commit cceb46b320

2 changed files with 10 additions and 14 deletions
@@ -734,7 +734,7 @@ class DocumentManager:
         new_files = []
         for ext in self.supported_extensions:
             logger.debug(f"Scanning for {ext} files in {self.input_dir}")
-            for file_path in self.input_dir.rglob(f"*{ext}"):
+            for file_path in self.input_dir.glob(f"*{ext}"):
                 if file_path not in self.indexed_files:
                     new_files.append(file_path)
         return new_files
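For context on the fix: pathlib's rglob() matches recursively (the pattern is implicitly prefixed with "**/"), while glob() with a bare pattern matches only the top-level directory. A minimal sketch of the difference, assuming a hypothetical scan root:

    from pathlib import Path

    input_dir = Path("inputs")  # hypothetical scan root

    # rglob descends into every subdirectory
    recursive_hits = list(input_dir.rglob("*.txt"))

    # glob with a bare pattern stays at the top level,
    # which is the behavior this commit switches to
    top_level_hits = list(input_dir.glob("*.txt"))
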
@@ -1122,12 +1122,14 @@ async def pipeline_enqueue_file(
                 {
                     "file_path": str(file_path.name),
                     "error_description": "File format processing error",
-                    "original_error": f"Unexpected error during file processing: {str(e)}",
+                    "original_error": f"Unexpected error during file extracting: {str(e)}",
                     "file_size": file_size,
                 }
             ]
             await rag.apipeline_enqueue_error_documents(error_files, track_id)
-            logger.error(f"Unexpected error processing file {file_path.name}: {str(e)}")
+            logger.error(
+                f"Unexpected error during {file_path.name} extracting: {str(e)}"
+            )
             return False, track_id

     # Insert into the RAG queue
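The hunk above follows a recurring pattern in pipeline_enqueue_file: build a list of error descriptors, record them via rag.apipeline_enqueue_error_documents, then bail out. A hedged sketch of that shape; the helper name and the surrounding variables (rag, file_size, track_id) are assumptions drawn from the visible context, not the actual function layout:

    # Sketch only: mirrors the error-reporting shape visible in the diff.
    async def report_file_error(rag, file_path, file_size, track_id, exc):
        # One descriptor per failed file, matching the keys shown above.
        error_files = [
            {
                "file_path": str(file_path.name),
                "error_description": "File format processing error",
                "original_error": f"Unexpected error during file extracting: {str(exc)}",
                "file_size": file_size,
            }
        ]
        # Record the failure in doc_status so it is visible to later stages.
        await rag.apipeline_enqueue_error_documents(error_files, track_id)
        return False, track_id
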
@@ -1144,7 +1146,7 @@ async def pipeline_enqueue_file(
             ]
             await rag.apipeline_enqueue_error_documents(error_files, track_id)
             logger.warning(
-                f"File contains only whitespace characters. file_paths={file_path.name}"
+                f"File contains only whitespace characters: {file_path.name}"
             )
             return False, track_id

@@ -1168,7 +1170,7 @@ async def pipeline_enqueue_file(

         # Move the file
         file_path.rename(target_path)
-        logger.info(
+        logger.debug(
             f"Moved file to enqueued directory: {file_path.name} -> {unique_filename}"
         )

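target_path and unique_filename are built outside this hunk. A hedged sketch of one common way to construct such a collision-free target; the uuid-prefix scheme here is an assumption and may differ from what the function actually does:

    import uuid
    from pathlib import Path

    def build_enqueued_target(file_path: Path, enqueued_dir: Path) -> Path:
        # Prefix with a random token so identical filenames cannot collide.
        unique_filename = f"{uuid.uuid4().hex}_{file_path.name}"
        enqueued_dir.mkdir(parents=True, exist_ok=True)
        return enqueued_dir / unique_filename

file_path.rename(target_path) then moves the file in a single filesystem operation, as the hunk shows.
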
@@ -1202,7 +1204,7 @@ async def pipeline_enqueue_file(
             }
         ]
         await rag.apipeline_enqueue_error_documents(error_files, track_id)
-        logger.error(f"No content could be extracted from file: {file_path.name}")
+        logger.error(f"No content extracted from file: {file_path.name}")
         return False, track_id

     except Exception as e:
@@ -1221,7 +1223,7 @@ async def pipeline_enqueue_file(
             }
         ]
         await rag.apipeline_enqueue_error_documents(error_files, track_id)
-        logger.error(f"Error processing or enqueueing file {file_path.name}: {str(e)}")
+        logger.error(f"Enqueuing file {file_path.name} error: {str(e)}")
         logger.error(traceback.format_exc())
         return False, track_id
     finally:
@@ -1173,16 +1173,10 @@ class LightRAG:
         # Store error documents in doc_status
         if error_docs:
             await self.doc_status.upsert(error_docs)
-            logger.info(
-                f"Recorded {len(error_docs)} file extraction errors in doc_status"
-            )

-            # Log each error for debugging
             for doc_id, error_doc in error_docs.items():
                 logger.error(
-                    f"File extraction error recorded - ID: {doc_id}, "
-                    f"File: {error_doc['file_path']}, "
-                    f"Error: {error_doc['content_summary']}"
+                    f"File processing error: - ID: {doc_id} {error_doc['file_path']}"
                 )

     async def _validate_and_fix_document_consistency(
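After this change, each failed document gets one compact error line instead of a multi-line record. A runnable sketch of the resulting loop, with a hypothetical error_docs payload; the real records likely carry more fields than shown:

    import logging

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger("lightrag")

    # Hypothetical payload; real records come from the extraction pipeline.
    error_docs = {
        "doc-123": {"file_path": "report.pdf", "content_summary": "File format processing error"},
    }

    for doc_id, error_doc in error_docs.items():
        # One line per failure: document ID plus originating file.
        logger.error(f"File processing error: - ID: {doc_id} {error_doc['file_path']}")
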