Fix document filtering logic and improve logging for ignored docs
This commit is contained in:
parent
5d00c4c7a8
commit
5591ef3ac8
1 changed file with 12 additions and 10 deletions
|
|
@@ -1077,19 +1077,21 @@ class LightRAG:
|
||||||
# 4. Filter out already processed documents
|
# 4. Filter out already processed documents
|
||||||
# Get docs ids
|
# Get docs ids
|
||||||
all_new_doc_ids = set(new_docs.keys())
|
all_new_doc_ids = set(new_docs.keys())
|
||||||
# Exclude IDs of documents that are already in progress
|
# Exclude IDs of documents that are already enqueued
|
||||||
unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
|
unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
|
||||||
|
|
||||||
# Log ignored document IDs
|
# Log ignored document IDs (documents that were filtered out because they already exist)
|
||||||
ignored_ids = [
|
ignored_ids = list(all_new_doc_ids - unique_new_doc_ids)
|
||||||
doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs
|
|
||||||
]
|
|
||||||
if ignored_ids:
|
if ignored_ids:
|
||||||
logger.warning(
|
|
||||||
f"Ignoring {len(ignored_ids)} document IDs not found in new_docs"
|
|
||||||
)
|
|
||||||
for doc_id in ignored_ids:
|
for doc_id in ignored_ids:
|
||||||
logger.warning(f"Ignored document ID: {doc_id}")
|
file_path = new_docs.get(doc_id, {}).get("file_path", "unknown_source")
|
||||||
|
logger.warning(
|
||||||
|
f"Ignoring document ID (already exists): {doc_id} ({file_path})"
|
||||||
|
)
|
||||||
|
if len(ignored_ids) > 3:
|
||||||
|
logger.warning(
|
||||||
|
f"Total Ignoring {len(ignored_ids)} document IDs that already exist in storage"
|
||||||
|
)
|
||||||
|
|
||||||
# Filter new_docs to only include documents with unique IDs
|
# Filter new_docs to only include documents with unique IDs
|
||||||
new_docs = {
|
new_docs = {
|
||||||
|
|
@@ -1099,7 +1101,7 @@ class LightRAG:
|
||||||
}
|
}
|
||||||
|
|
||||||
if not new_docs:
|
if not new_docs:
|
||||||
logger.info("No new unique documents were found.")
|
logger.warning("No new unique documents were found.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# 5. Store document content in full_docs and status in doc_status
|
# 5. Store document content in full_docs and status in doc_status
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue