fix: make ids parameter optional and optimize input text cleaning

- Add default None value for ids parameter
- Move text cleaning into else branch
- Only clean text when auto-generating ids
- Preserve original text with custom ids
- Improve code readability
2025-02-23 15:46:47 +08:00 · 2025-02-23 15:46:47 +08:00 · 845e914f1b
commit 845e914f1b
parent 411782797b
1 changed files with 3 additions and 4 deletions
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -577,7 +577,7 @@ class LightRAG:
                await self._insert_done()

    async def apipeline_enqueue_documents(
-        self, input: str | list[str], ids: list[str] | None
+        self, input: str | list[str], ids: list[str] | None = None
    ) -> None:
        """
        Pipeline for Processing Documents
@@ -591,9 +591,6 @@ class LightRAG:
        if isinstance(input, str):
            input = [input]

-        # Clean input text and remove duplicates
-        input = list(set(self.clean_text(doc) for doc in input))
-
        # 1. Validate ids if provided or generate MD5 hash IDs
        if ids is not None:
            # Check if the number of IDs matches the number of documents
@@ -607,6 +604,8 @@ class LightRAG:
            # Generate contents dict of IDs provided by user and documents
            contents = {id_: doc for id_, doc in zip(ids, input)}
        else:
+            # Clean input text and remove duplicates
+            input = list(set(self.clean_text(doc) for doc in input))
            # Generate contents dict of MD5 hash IDs and documents
            contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}