diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index bef965fa..8be61205 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -971,11 +971,10 @@ class LightRAG:
         """
         Pipeline for Processing Documents

-        1. Validate ids if provided or generate MD5 hash IDs
-        2. Remove duplicate contents
-        3. Generate document initial status
-        4. Filter out already processed documents
-        5. Enqueue document in status
+        1. Validate ids if provided or generate MD5 hash IDs and remove duplicate contents
+        2. Generate document initial status
+        3. Filter out already processed documents
+        4. Enqueue document in status

         Args:
             input: Single document string or list of document strings
@@ -1008,7 +1007,7 @@ class LightRAG:
             # If no file paths provided, use placeholder
             file_paths = ["unknown_source"] * len(input)

-        # 1. Validate ids if provided or generate MD5 hash IDs
+        # 1. Validate ids if provided or generate MD5 hash IDs and remove duplicate contents
         if ids is not None:
             # Check if the number of IDs matches the number of documents
             if len(ids) != len(input):
@@ -1018,22 +1017,25 @@ class LightRAG:
             if len(ids) != len(set(ids)):
                 raise ValueError("IDs must be unique")

-            # Generate contents dict of IDs provided by user and documents
+            # Generate contents dict and remove duplicates in one pass
+            unique_contents = {}
+            for id_, doc, path in zip(ids, input, file_paths):
+                cleaned_content = clean_text(doc)
+                if cleaned_content not in unique_contents:
+                    unique_contents[cleaned_content] = (id_, path)
+
+            # Reconstruct contents with unique content
             contents = {
-                id_: {"content": doc, "file_path": path}
-                for id_, doc, path in zip(ids, input, file_paths)
+                id_: {"content": content, "file_path": file_path}
+                for content, (id_, file_path) in unique_contents.items()
             }
         else:
-            # Clean input text and remove duplicates
-            cleaned_input = [
-                (clean_text(doc), path) for doc, path in zip(input, file_paths)
-            ]
+            # Clean input text and remove duplicates in one pass
             unique_content_with_paths = {}
-
-            # Keep track of unique content and their paths
-            for content, path in cleaned_input:
-                if content not in unique_content_with_paths:
-                    unique_content_with_paths[content] = path
+            for doc, path in zip(input, file_paths):
+                cleaned_content = clean_text(doc)
+                if cleaned_content not in unique_content_with_paths:
+                    unique_content_with_paths[cleaned_content] = path

             # Generate contents dict of MD5 hash IDs and documents with paths
             contents = {
@@ -1044,21 +1046,7 @@ class LightRAG:
                 for content, path in unique_content_with_paths.items()
             }

-        # 2. Remove duplicate contents
-        unique_contents = {}
-        for id_, content_data in contents.items():
-            content = content_data["content"]
-            file_path = content_data["file_path"]
-            if content not in unique_contents:
-                unique_contents[content] = (id_, file_path)
-
-        # Reconstruct contents with unique content
-        contents = {
-            id_: {"content": content, "file_path": file_path}
-            for content, (id_, file_path) in unique_contents.items()
-        }
-
-        # 3. Generate document initial status (without content)
+        # 2. Generate document initial status (without content)
         new_docs: dict[str, Any] = {
             id_: {
                 "status": DocStatus.PENDING,
@@ -1074,7 +1062,7 @@ class LightRAG:
             for id_, content_data in contents.items()
         }

-        # 4. Filter out already processed documents
+        # 3. Filter out already processed documents
         # Get docs ids
         all_new_doc_ids = set(new_docs.keys())
         # Exclude IDs of documents that are already enqueued
@@ -1104,8 +1092,8 @@ class LightRAG:
             logger.warning("No new unique documents were found.")
             return

-        # 5. Store document content in full_docs and status in doc_status
-        # Store full document content separately
+        # 4. Store document content in full_docs and status in doc_status
+        # Store full document content separately
         full_docs_data = {
             doc_id: {"content": contents[doc_id]["content"]}
             for doc_id in new_docs.keys()
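For context, here is a minimal, self-contained sketch of the one-pass deduplication behavior this diff introduces: documents are keyed by their cleaned content, the first occurrence of each unique content wins, and (when no IDs are supplied) the surviving contents receive MD5-hash IDs. The `clean_text` and `compute_mdhash_id` helpers below are simplified stand-ins for LightRAG's utilities, and `dedup_documents` is a hypothetical wrapper written only for illustration.

```python
from hashlib import md5


def clean_text(text: str) -> str:
    # Simplified stand-in for LightRAG's clean_text utility: strip surrounding whitespace.
    return text.strip()


def compute_mdhash_id(content: str, prefix: str = "") -> str:
    # Simplified stand-in for LightRAG's compute_mdhash_id utility: prefix + MD5 of content.
    return prefix + md5(content.encode("utf-8")).hexdigest()


def dedup_documents(docs: list[str], file_paths: list[str]) -> dict[str, dict[str, str]]:
    # One-pass deduplication keyed on cleaned content: only the first occurrence
    # of each unique content (and its file path) is kept.
    unique_content_with_paths: dict[str, str] = {}
    for doc, path in zip(docs, file_paths):
        cleaned = clean_text(doc)
        if cleaned not in unique_content_with_paths:
            unique_content_with_paths[cleaned] = path

    # Assign MD5-hash IDs to the surviving unique contents.
    return {
        compute_mdhash_id(content, prefix="doc-"): {"content": content, "file_path": path}
        for content, path in unique_content_with_paths.items()
    }


if __name__ == "__main__":
    docs = ["Hello world", "  Hello world  ", "Another doc"]
    paths = ["a.txt", "b.txt", "c.txt"]
    # The first two inputs clean to the same content, so only two entries remain.
    for doc_id, data in dedup_documents(docs, paths).items():
        print(doc_id, data)
```

The same first-wins rule applies in the `ids is not None` branch of the diff: a later document whose cleaned content duplicates an earlier one is dropped along with its user-supplied ID.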