diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index 317509b3..46673521 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -95,9 +95,6 @@ class JsonDocStatusStorage(DocStatusStorage): try: # Make a copy of the data to avoid modifying the original data = v.copy() - # If content is missing, use content_summary as content - if "content" not in data and "content_summary" in data: - data["content"] = data["content_summary"] # If file_path is not in data, use document id as file path if "file_path" not in data: data["file_path"] = "no-file-path" diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 5ecbedfd..1d6dc04b 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -372,19 +372,19 @@ class MongoDocStatusStorage(DocStatusStorage): """Get all documents with a specific status""" cursor = self._data.find({"status": status.value}) result = await cursor.to_list() - return { - doc["_id"]: DocProcessingStatus( - content_summary=doc.get("content_summary"), - content_length=doc["content_length"], - file_path=doc.get("file_path", doc["_id"]), - status=doc["status"], - created_at=doc.get("created_at"), - updated_at=doc.get("updated_at"), - chunks_count=doc.get("chunks_count", -1), - chunks_list=doc.get("chunks_list", []), - ) - for doc in result - } + processed_result = {} + for doc in result: + try: + # Make a copy of the data to avoid modifying the original + data = doc.copy() + # If file_path is not in data, use document id as file path + if "file_path" not in data: + data["file_path"] = "no-file-path" + processed_result[doc["_id"]] = DocProcessingStatus(**data) + except KeyError as e: + logger.error(f"Missing required field for document {doc['_id']}: {e}") + continue + return processed_result async def index_done_callback(self) -> None: # Mongo handles persistence automatically