From 8274ed52d1e0a7615211e07b3774d89ebeda65d0 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 29 Jul 2025 14:20:07 +0800
Subject: [PATCH 1/4] feat: separate document content from doc_status to
 improve performance

This optimization significantly improves doc_status query/update performance by avoiding large string operations during frequent status checks.
---
 lightrag/lightrag.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 5df36bbb..336604e1 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -863,11 +863,10 @@ class LightRAG:
             for content, (id_, file_path) in unique_contents.items()
         }
 
-        # 3. Generate document initial status
+        # 3. Generate document initial status (without content)
         new_docs: dict[str, Any] = {
             id_: {
                 "status": DocStatus.PENDING,
-                "content": content_data["content"],
                 "content_summary": get_content_summary(content_data["content"]),
                 "content_length": len(content_data["content"]),
                 "created_at": datetime.now(timezone.utc).isoformat(),
@@ -907,7 +906,15 @@ class LightRAG:
             logger.info("No new unique documents were found.")
             return
 
-        # 5. Store status document
+        # 5. Store document content in full_docs and status in doc_status
+        # Store full document content separately
+        full_docs_data = {
+            doc_id: {"content": contents[doc_id]["content"]}
+            for doc_id in new_docs.keys()
+        }
+        await self.full_docs.upsert(full_docs_data)
+
+        # Store document status (without content)
         await self.doc_status.upsert(new_docs)
         logger.info(f"Stored {len(new_docs)} new unique documents")
 
@@ -1049,6 +1056,14 @@ class LightRAG:
                                 pipeline_status["latest_message"] = log_message
                                 pipeline_status["history_messages"].append(log_message)
 
+                            # Get document content from full_docs
+                            content_data = await self.full_docs.get_by_id(doc_id)
+                            if not content_data:
+                                raise Exception(
+                                    f"Document content not found in full_docs for doc_id: {doc_id}"
+                                )
+                            content = content_data["content"]
+
                             # Generate chunks from document
                             chunks: dict[str, Any] = {
                                 compute_mdhash_id(dp["content"], prefix="chunk-"): {
@@ -1059,7 +1074,7 @@ class LightRAG:
                                 }
                                 for dp in self.chunking_func(
                                     self.tokenizer,
-                                    status_doc.content,
+                                    content,
                                     split_by_character,
                                     split_by_character_only,
                                     self.chunk_overlap_token_size,
@@ -1081,7 +1096,6 @@ class LightRAG:
                                             "chunks_list": list(
                                                 chunks.keys()
                                             ),  # Save chunks list
-                                            "content": status_doc.content,
                                             "content_summary": status_doc.content_summary,
                                             "content_length": status_doc.content_length,
                                             "created_at": status_doc.created_at,
@@ -1096,11 +1110,6 @@ class LightRAG:
                             chunks_vdb_task = asyncio.create_task(
                                 self.chunks_vdb.upsert(chunks)
                             )
-                            full_docs_task = asyncio.create_task(
-                                self.full_docs.upsert(
-                                    {doc_id: {"content": status_doc.content}}
-                                )
-                            )
                             text_chunks_task = asyncio.create_task(
                                 self.text_chunks.upsert(chunks)
                             )
@@ -1109,7 +1118,6 @@ class LightRAG:
                             first_stage_tasks = [
                                 doc_status_task,
                                 chunks_vdb_task,
-                                full_docs_task,
                                 text_chunks_task,
                             ]
                             entity_relation_task = None
@@ -1158,7 +1166,6 @@ class LightRAG:
                                     doc_id: {
                                         "status": DocStatus.FAILED,
                                         "error": str(e),
-                                        "content": status_doc.content,
                                         "content_summary": status_doc.content_summary,
                                         "content_length": status_doc.content_length,
                                         "created_at": status_doc.created_at,
@@ -1197,7 +1204,6 @@ class LightRAG:
                                             "chunks_list": list(
                                                 chunks.keys()
                                             ),  # 保留 chunks_list
-                                            "content": status_doc.content,
                                             "content_summary": status_doc.content_summary,
                                             "content_length": status_doc.content_length,
                                             "created_at": status_doc.created_at,
@@ -1244,7 +1250,6 @@ class LightRAG:
                                         doc_id: {
                                             "status": DocStatus.FAILED,
                                             "error": str(e),
-                                            "content": status_doc.content,
                                             "content_summary": status_doc.content_summary,
                                             "content_length": status_doc.content_length,
                                             "created_at": status_doc.created_at,

From 24c36d876ce90d7ce0a92b806c88350567371492 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 29 Jul 2025 14:52:45 +0800
Subject: [PATCH 2/4] Remove content field from DocProcessingStatus, update
 MongoDB and PostgreSQL implementation

---
 lightrag/base.py             |  2 --
 lightrag/kg/mongo_impl.py    |  3 +--
 lightrag/kg/postgres_impl.py | 14 +++++++-------
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/lightrag/base.py b/lightrag/base.py
index ac0545ce..9af3250a 100644
--- a/lightrag/base.py
+++ b/lightrag/base.py
@@ -629,8 +629,6 @@ class DocStatus(str, Enum):
 class DocProcessingStatus:
     """Document processing status data structure"""
 
-    content: str
-    """Original content of the document"""
     content_summary: str
     """First 100 chars of document content, used for preview"""
     content_length: int
diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py
index dcf99327..5ecbedfd 100644
--- a/lightrag/kg/mongo_impl.py
+++ b/lightrag/kg/mongo_impl.py
@@ -374,14 +374,13 @@ class MongoDocStatusStorage(DocStatusStorage):
         result = await cursor.to_list()
         return {
             doc["_id"]: DocProcessingStatus(
-                content=doc["content"],
                 content_summary=doc.get("content_summary"),
                 content_length=doc["content_length"],
+                file_path=doc.get("file_path", doc["_id"]),
                 status=doc["status"],
                 created_at=doc.get("created_at"),
                 updated_at=doc.get("updated_at"),
                 chunks_count=doc.get("chunks_count", -1),
-                file_path=doc.get("file_path", doc["_id"]),
                 chunks_list=doc.get("chunks_list", []),
             )
             for doc in result
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index 0e49a67f..d2b170bf 100644
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -1673,7 +1673,7 @@ class PGDocStatusStorage(DocStatusStorage):
             updated_at = self._format_datetime_with_timezone(result[0]["updated_at"])
 
             return dict(
-                content=result[0]["content"],
+                # content=result[0]["content"],
                 content_length=result[0]["content_length"],
                 content_summary=result[0]["content_summary"],
                 status=result[0]["status"],
@@ -1713,7 +1713,7 @@ class PGDocStatusStorage(DocStatusStorage):
 
             processed_results.append(
                 {
-                    "content": row["content"],
+                    # "content": row["content"],
                     "content_length": row["content_length"],
                     "content_summary": row["content_summary"],
                     "status": row["status"],
@@ -1762,7 +1762,7 @@ class PGDocStatusStorage(DocStatusStorage):
             updated_at = self._format_datetime_with_timezone(element["updated_at"])
 
             docs_by_status[element["id"]] = DocProcessingStatus(
-                content=element["content"],
+                # content=element["content"],
                 content_summary=element["content_summary"],
                 content_length=element["content_length"],
                 status=element["status"],
@@ -1845,10 +1845,9 @@ class PGDocStatusStorage(DocStatusStorage):
 
         # Modified SQL to include created_at, updated_at, and chunks_list in both INSERT and UPDATE operations
         # All fields are updated from the input data in both INSERT and UPDATE cases
-        sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status,file_path,chunks_list,created_at,updated_at)
-                 values($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
+        sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content_summary,content_length,chunks_count,status,file_path,chunks_list,created_at,updated_at)
+                 values($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
                   on conflict(id,workspace) do update set
-                  content = EXCLUDED.content,
                   content_summary = EXCLUDED.content_summary,
                   content_length = EXCLUDED.content_length,
                   chunks_count = EXCLUDED.chunks_count,
@@ -1868,7 +1867,7 @@ class PGDocStatusStorage(DocStatusStorage):
                 {
                     "workspace": self.db.workspace,
                     "id": k,
-                    "content": v["content"],
+                    # "content": v["content"],
                     "content_summary": v["content_summary"],
                     "content_length": v["content_length"],
                     "chunks_count": v["chunks_count"] if "chunks_count" in v else -1,
@@ -3364,6 +3363,7 @@ TABLES = {
 	                CONSTRAINT LIGHTRAG_LLM_CACHE_PK PRIMARY KEY (workspace, mode, id)
                     )"""
     },
+    # content column in LIGHTRAG_DOC_STATUS is deprecated, use the same column in LIGHTRAG_DOC_FULL instead
     "LIGHTRAG_DOC_STATUS": {
         "ddl": """CREATE TABLE LIGHTRAG_DOC_STATUS (
 	               workspace varchar(255) NOT NULL,

From 92bbb7a1b3f5d66141ff3673b9b0bd6424015f56 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 29 Jul 2025 16:13:51 +0800
Subject: [PATCH 3/4] Remove content fallback and standardize doc status
 handling

- Remove content_summary fallback logic
- Standardize doc status processing
- Handle missing file_path consistently
---
 lightrag/kg/json_doc_status_impl.py |  3 ---
 lightrag/kg/mongo_impl.py           | 26 +++++++++++++-------------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py
index 317509b3..46673521 100644
--- a/lightrag/kg/json_doc_status_impl.py
+++ b/lightrag/kg/json_doc_status_impl.py
@@ -95,9 +95,6 @@ class JsonDocStatusStorage(DocStatusStorage):
                     try:
                         # Make a copy of the data to avoid modifying the original
                         data = v.copy()
-                        # If content is missing, use content_summary as content
-                        if "content" not in data and "content_summary" in data:
-                            data["content"] = data["content_summary"]
                         # If file_path is not in data, use document id as file path
                         if "file_path" not in data:
                             data["file_path"] = "no-file-path"
diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py
index 5ecbedfd..1d6dc04b 100644
--- a/lightrag/kg/mongo_impl.py
+++ b/lightrag/kg/mongo_impl.py
@@ -372,19 +372,19 @@ class MongoDocStatusStorage(DocStatusStorage):
         """Get all documents with a specific status"""
         cursor = self._data.find({"status": status.value})
         result = await cursor.to_list()
-        return {
-            doc["_id"]: DocProcessingStatus(
-                content_summary=doc.get("content_summary"),
-                content_length=doc["content_length"],
-                file_path=doc.get("file_path", doc["_id"]),
-                status=doc["status"],
-                created_at=doc.get("created_at"),
-                updated_at=doc.get("updated_at"),
-                chunks_count=doc.get("chunks_count", -1),
-                chunks_list=doc.get("chunks_list", []),
-            )
-            for doc in result
-        }
+        processed_result = {}
+        for doc in result:
+            try:
+                # Make a copy of the data to avoid modifying the original
+                data = doc.copy()
+                # If file_path is not in data, use document id as file path
+                if "file_path" not in data:
+                    data["file_path"] = "no-file-path"
+                processed_result[doc["_id"]] = DocProcessingStatus(**data)
+            except KeyError as e:
+                logger.error(f"Missing required field for document {doc['_id']}: {e}")
+                continue
+        return processed_result
 
     async def index_done_callback(self) -> None:
         # Mongo handles persistence automatically

From dafdf92715fe1ea1dfc8a4b999325d266b39ea9c Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 29 Jul 2025 19:13:07 +0800
Subject: [PATCH 4/4] Remove content fallback logic in get_docs_by_status from
 Redis

---
 lightrag/kg/redis_impl.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py
index 31752d69..6518cf04 100644
--- a/lightrag/kg/redis_impl.py
+++ b/lightrag/kg/redis_impl.py
@@ -786,12 +786,6 @@ class RedisDocStatusStorage(DocStatusStorage):
 
                                         # Make a copy of the data to avoid modifying the original
                                         data = doc_data.copy()
-                                        # If content is missing, use content_summary as content
-                                        if (
-                                            "content" not in data
-                                            and "content_summary" in data
-                                        ):
-                                            data["content"] = data["content_summary"]
                                         # If file_path is not in data, use document id as file path
                                         if "file_path" not in data:
                                             data["file_path"] = "no-file-path"