Fix document filtering logic and improve logging for ignored docs

2025-08-16 17:22:08 +08:00 · 2025-08-16 17:22:08 +08:00 · 5591ef3ac8
commit 5591ef3ac8
parent 5d00c4c7a8
1 changed file with 12 additions and 10 deletions
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1077,19 +1077,21 @@ class LightRAG:
        # 4. Filter out already processed documents
        # Get docs ids
        all_new_doc_ids = set(new_docs.keys())
-        # Exclude IDs of documents that are already in progress
+        # Exclude IDs of documents that are already enqueued
        unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)

-        # Log ignored document IDs
-        ignored_ids = [
-            doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs
-        ]
+        # Log ignored document IDs (documents that were filtered out because they already exist)
+        ignored_ids = list(all_new_doc_ids - unique_new_doc_ids)
        if ignored_ids:
-            logger.warning(
-                f"Ignoring {len(ignored_ids)} document IDs not found in new_docs"
-            )
            for doc_id in ignored_ids:
-                logger.warning(f"Ignored document ID: {doc_id}")
+                file_path = new_docs.get(doc_id, {}).get("file_path", "unknown_source")
+                logger.warning(
+                    f"Ignoring document ID (already exists): {doc_id} ({file_path})"
+                )
+            if len(ignored_ids) > 3:
+                logger.warning(
+                    f"Total Ignoring {len(ignored_ids)} document IDs that already exist in storage"
+                )

        # Filter new_docs to only include documents with unique IDs
        new_docs = {
@@ -1099,7 +1101,7 @@ class LightRAG:
        }

        if not new_docs:
-            logger.info("No new unique documents were found.")
+            logger.warning("No new unique documents were found.")
            return

        # 5. Store document content in full_docs and status in doc_status