From 8274ed52d1e0a7615211e07b3774d89ebeda65d0 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 29 Jul 2025 14:20:07 +0800 Subject: [PATCH 1/4] feat: separate document content from doc_status to improve performance This optimization significantly improves doc_status query/update performance by avoiding large string operations during frequent status checks. --- lightrag/lightrag.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5df36bbb..336604e1 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -863,11 +863,10 @@ class LightRAG: for content, (id_, file_path) in unique_contents.items() } - # 3. Generate document initial status + # 3. Generate document initial status (without content) new_docs: dict[str, Any] = { id_: { "status": DocStatus.PENDING, - "content": content_data["content"], "content_summary": get_content_summary(content_data["content"]), "content_length": len(content_data["content"]), "created_at": datetime.now(timezone.utc).isoformat(), @@ -907,7 +906,15 @@ class LightRAG: logger.info("No new unique documents were found.") return - # 5. Store status document + # 5. Store document content in full_docs and status in doc_status + # Store full document content separately + full_docs_data = { + doc_id: {"content": contents[doc_id]["content"]} + for doc_id in new_docs.keys() + } + await self.full_docs.upsert(full_docs_data) + + # Store document status (without content) await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") @@ -1049,6 +1056,14 @@ class LightRAG: pipeline_status["latest_message"] = log_message pipeline_status["history_messages"].append(log_message) + # Get document content from full_docs + content_data = await self.full_docs.get_by_id(doc_id) + if not content_data: + raise Exception( + f"Document content not found in full_docs for doc_id: {doc_id}" + ) + content = content_data["content"] + # Generate chunks from document chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { @@ -1059,7 +1074,7 @@ class LightRAG: } for dp in self.chunking_func( self.tokenizer, - status_doc.content, + content, split_by_character, split_by_character_only, self.chunk_overlap_token_size, @@ -1081,7 +1096,6 @@ class LightRAG: "chunks_list": list( chunks.keys() ), # Save chunks list - "content": status_doc.content, "content_summary": status_doc.content_summary, "content_length": status_doc.content_length, "created_at": status_doc.created_at, @@ -1096,11 +1110,6 @@ class LightRAG: chunks_vdb_task = asyncio.create_task( self.chunks_vdb.upsert(chunks) ) - full_docs_task = asyncio.create_task( - self.full_docs.upsert( - {doc_id: {"content": status_doc.content}} - ) - ) text_chunks_task = asyncio.create_task( self.text_chunks.upsert(chunks) ) @@ -1109,7 +1118,6 @@ class LightRAG: first_stage_tasks = [ doc_status_task, chunks_vdb_task, - full_docs_task, text_chunks_task, ] entity_relation_task = None @@ -1158,7 +1166,6 @@ class LightRAG: doc_id: { "status": DocStatus.FAILED, "error": str(e), - "content": status_doc.content, "content_summary": status_doc.content_summary, "content_length": status_doc.content_length, "created_at": status_doc.created_at, @@ -1197,7 +1204,6 @@ class LightRAG: "chunks_list": list( chunks.keys() ), # 保留 chunks_list - "content": status_doc.content, "content_summary": status_doc.content_summary, "content_length": status_doc.content_length, "created_at": status_doc.created_at, @@ -1244,7 +1250,6 @@ class LightRAG: doc_id: { "status": DocStatus.FAILED, "error": str(e), - "content": status_doc.content, "content_summary": status_doc.content_summary, "content_length": status_doc.content_length, "created_at": status_doc.created_at, From 24c36d876ce90d7ce0a92b806c88350567371492 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 29 Jul 2025 14:52:45 +0800 Subject: [PATCH 2/4] Remove content field from DocProcessingStatus, update MongoDB and PostgreSQL implementation --- lightrag/base.py | 2 -- lightrag/kg/mongo_impl.py | 3 +-- lightrag/kg/postgres_impl.py | 14 +++++++------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index ac0545ce..9af3250a 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -629,8 +629,6 @@ class DocStatus(str, Enum): class DocProcessingStatus: """Document processing status data structure""" - content: str - """Original content of the document""" content_summary: str """First 100 chars of document content, used for preview""" content_length: int diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index dcf99327..5ecbedfd 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -374,14 +374,13 @@ class MongoDocStatusStorage(DocStatusStorage): result = await cursor.to_list() return { doc["_id"]: DocProcessingStatus( - content=doc["content"], content_summary=doc.get("content_summary"), content_length=doc["content_length"], + file_path=doc.get("file_path", doc["_id"]), status=doc["status"], created_at=doc.get("created_at"), updated_at=doc.get("updated_at"), chunks_count=doc.get("chunks_count", -1), - file_path=doc.get("file_path", doc["_id"]), chunks_list=doc.get("chunks_list", []), ) for doc in result diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 0e49a67f..d2b170bf 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -1673,7 +1673,7 @@ class PGDocStatusStorage(DocStatusStorage): updated_at = self._format_datetime_with_timezone(result[0]["updated_at"]) return dict( - content=result[0]["content"], + # content=result[0]["content"], content_length=result[0]["content_length"], content_summary=result[0]["content_summary"], status=result[0]["status"], @@ -1713,7 +1713,7 @@ class PGDocStatusStorage(DocStatusStorage): processed_results.append( { - "content": row["content"], + # "content": row["content"], "content_length": row["content_length"], "content_summary": row["content_summary"], "status": row["status"], @@ -1762,7 +1762,7 @@ class PGDocStatusStorage(DocStatusStorage): updated_at = self._format_datetime_with_timezone(element["updated_at"]) docs_by_status[element["id"]] = DocProcessingStatus( - content=element["content"], + # content=element["content"], content_summary=element["content_summary"], content_length=element["content_length"], status=element["status"], @@ -1845,10 +1845,9 @@ class PGDocStatusStorage(DocStatusStorage): # Modified SQL to include created_at, updated_at, and chunks_list in both INSERT and UPDATE operations # All fields are updated from the input data in both INSERT and UPDATE cases - sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status,file_path,chunks_list,created_at,updated_at) - values($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11) + sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content_summary,content_length,chunks_count,status,file_path,chunks_list,created_at,updated_at) + values($1,$2,$3,$4,$5,$6,$7,$8,$9,$10) on conflict(id,workspace) do update set - content = EXCLUDED.content, content_summary = EXCLUDED.content_summary, content_length = EXCLUDED.content_length, chunks_count = EXCLUDED.chunks_count, @@ -1868,7 +1867,7 @@ class PGDocStatusStorage(DocStatusStorage): { "workspace": self.db.workspace, "id": k, - "content": v["content"], + # "content": v["content"], "content_summary": v["content_summary"], "content_length": v["content_length"], "chunks_count": v["chunks_count"] if "chunks_count" in v else -1, @@ -3364,6 +3363,7 @@ TABLES = { CONSTRAINT LIGHTRAG_LLM_CACHE_PK PRIMARY KEY (workspace, mode, id) )""" }, + # content column in LIGHTRAG_DOC_STATUS is deprecated, use the same column in LIGHTRAG_DOC_FULL instead "LIGHTRAG_DOC_STATUS": { "ddl": """CREATE TABLE LIGHTRAG_DOC_STATUS ( workspace varchar(255) NOT NULL, From 92bbb7a1b3f5d66141ff3673b9b0bd6424015f56 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 29 Jul 2025 16:13:51 +0800 Subject: [PATCH 3/4] Remove content fallback and standardize doc status handling - Remove content_summary fallback logic - Standardize doc status processing - Handle missing file_path consistently --- lightrag/kg/json_doc_status_impl.py | 3 --- lightrag/kg/mongo_impl.py | 26 +++++++++++++------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index 317509b3..46673521 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -95,9 +95,6 @@ class JsonDocStatusStorage(DocStatusStorage): try: # Make a copy of the data to avoid modifying the original data = v.copy() - # If content is missing, use content_summary as content - if "content" not in data and "content_summary" in data: - data["content"] = data["content_summary"] # If file_path is not in data, use document id as file path if "file_path" not in data: data["file_path"] = "no-file-path" diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 5ecbedfd..1d6dc04b 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -372,19 +372,19 @@ class MongoDocStatusStorage(DocStatusStorage): """Get all documents with a specific status""" cursor = self._data.find({"status": status.value}) result = await cursor.to_list() - return { - doc["_id"]: DocProcessingStatus( - content_summary=doc.get("content_summary"), - content_length=doc["content_length"], - file_path=doc.get("file_path", doc["_id"]), - status=doc["status"], - created_at=doc.get("created_at"), - updated_at=doc.get("updated_at"), - chunks_count=doc.get("chunks_count", -1), - chunks_list=doc.get("chunks_list", []), - ) - for doc in result - } + processed_result = {} + for doc in result: + try: + # Make a copy of the data to avoid modifying the original + data = doc.copy() + # If file_path is not in data, use document id as file path + if "file_path" not in data: + data["file_path"] = "no-file-path" + processed_result[doc["_id"]] = DocProcessingStatus(**data) + except KeyError as e: + logger.error(f"Missing required field for document {doc['_id']}: {e}") + continue + return processed_result async def index_done_callback(self) -> None: # Mongo handles persistence automatically From dafdf92715fe1ea1dfc8a4b999325d266b39ea9c Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 29 Jul 2025 19:13:07 +0800 Subject: [PATCH 4/4] Remove content fallback logic in get_docs_by_status from Redis --- lightrag/kg/redis_impl.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 31752d69..6518cf04 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -786,12 +786,6 @@ class RedisDocStatusStorage(DocStatusStorage): # Make a copy of the data to avoid modifying the original data = doc_data.copy() - # If content is missing, use content_summary as content - if ( - "content" not in data - and "content_summary" in data - ): - data["content"] = data["content_summary"] # If file_path is not in data, use document id as file path if "file_path" not in data: data["file_path"] = "no-file-path"