Optimize document processing pipeline by removing duplicate step

2025-08-16 17:23:01 +08:00 · 2025-08-16 17:23:01 +08:00 · e1310c5262
commit e1310c5262
parent 5591ef3ac8
1 changed file with 24 additions and 36 deletions
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -971,11 +971,10 @@ class LightRAG:
        """
        Pipeline for Processing Documents
-        1. Validate ids if provided or generate MD5 hash IDs
+        1. Validate ids if provided or generate MD5 hash IDs and remove duplicate contents
-        2. Remove duplicate contents
+        2. Generate document initial status
-        3. Generate document initial status
+        3. Filter out already processed documents
-        4. Filter out already processed documents
+        4. Enqueue document in status
-        5. Enqueue document in status
        Args:
            input: Single document string or list of document strings
@@ -1008,7 +1007,7 @@ class LightRAG:
            # If no file paths provided, use placeholder
            file_paths = ["unknown_source"] * len(input)
-        # 1. Validate ids if provided or generate MD5 hash IDs
+        # 1. Validate ids if provided or generate MD5 hash IDs and remove duplicate contents
        if ids is not None:
            # Check if the number of IDs matches the number of documents
            if len(ids) != len(input):
@@ -1018,22 +1017,25 @@ class LightRAG:
            if len(ids) != len(set(ids)):
                raise ValueError("IDs must be unique")
-            # Generate contents dict of IDs provided by user and documents
+            # Generate contents dict and remove duplicates in one pass
+            unique_contents = {}
+            for id_, doc, path in zip(ids, input, file_paths):
+                cleaned_content = clean_text(doc)
+                if cleaned_content not in unique_contents:
+                    unique_contents[cleaned_content] = (id_, path)
+            # Reconstruct contents with unique content
            contents = {
-                id_: {"content": doc, "file_path": path}
+                id_: {"content": content, "file_path": file_path}
-                for id_, doc, path in zip(ids, input, file_paths)
+                for content, (id_, file_path) in unique_contents.items()
            }
        else:
-            # Clean input text and remove duplicates
+            # Clean input text and remove duplicates in one pass
-            cleaned_input = [
-                (clean_text(doc), path) for doc, path in zip(input, file_paths)
-            ]
            unique_content_with_paths = {}
-
+            for doc, path in zip(input, file_paths):
-            # Keep track of unique content and their paths
+                cleaned_content = clean_text(doc)
-            for content, path in cleaned_input:
+                if cleaned_content not in unique_content_with_paths:
-                if content not in unique_content_with_paths:
+                    unique_content_with_paths[cleaned_content] = path
-                    unique_content_with_paths[content] = path
            # Generate contents dict of MD5 hash IDs and documents with paths
            contents = {
@@ -1044,21 +1046,7 @@ class LightRAG:
                for content, path in unique_content_with_paths.items()
            }
-        # 2. Remove duplicate contents
+        # 2. Generate document initial status (without content)
-        unique_contents = {}
-        for id_, content_data in contents.items():
-            content = content_data["content"]
-            file_path = content_data["file_path"]
-            if content not in unique_contents:
-                unique_contents[content] = (id_, file_path)
-        # Reconstruct contents with unique content
-        contents = {
-            id_: {"content": content, "file_path": file_path}
-            for content, (id_, file_path) in unique_contents.items()
-        }
-        # 3. Generate document initial status (without content)
        new_docs: dict[str, Any] = {
            id_: {
                "status": DocStatus.PENDING,
@@ -1074,7 +1062,7 @@ class LightRAG:
            for id_, content_data in contents.items()
        }
-        # 4. Filter out already processed documents
+        # 3. Filter out already processed documents
        # Get docs ids
        all_new_doc_ids = set(new_docs.keys())
        # Exclude IDs of documents that are already enqueued
@@ -1104,8 +1092,8 @@ class LightRAG:
            logger.warning("No new unique documents were found.")
            return
-        # 5. Store document content in full_docs and status in doc_status
+        # 4. Store document content in full_docs and status in doc_status
-        # Store full document content separately
+        #    Store full document content separately
        full_docs_data = {
            doc_id: {"content": contents[doc_id]["content"]}
            for doc_id in new_docs.keys()