Remove content fallback and standardize doc status handling

- Remove content_summary fallback logic
- Standardize doc status processing
- Handle missing file_path consistently
This commit is contained in:
yangdx 2025-07-29 16:13:51 +08:00
parent 24c36d876c
commit 92bbb7a1b3
2 changed files with 13 additions and 16 deletions

View file

@ -95,9 +95,6 @@ class JsonDocStatusStorage(DocStatusStorage):
try: try:
# Make a copy of the data to avoid modifying the original # Make a copy of the data to avoid modifying the original
data = v.copy() data = v.copy()
# If content is missing, use content_summary as content
if "content" not in data and "content_summary" in data:
data["content"] = data["content_summary"]
# If file_path is not in data, fall back to the "no-file-path" placeholder # If file_path is not in data, fall back to the "no-file-path" placeholder
if "file_path" not in data: if "file_path" not in data:
data["file_path"] = "no-file-path" data["file_path"] = "no-file-path"

View file

@ -372,19 +372,19 @@ class MongoDocStatusStorage(DocStatusStorage):
"""Get all documents with a specific status""" """Get all documents with a specific status"""
cursor = self._data.find({"status": status.value}) cursor = self._data.find({"status": status.value})
result = await cursor.to_list() result = await cursor.to_list()
return { processed_result = {}
doc["_id"]: DocProcessingStatus( for doc in result:
content_summary=doc.get("content_summary"), try:
content_length=doc["content_length"], # Make a copy of the data to avoid modifying the original
file_path=doc.get("file_path", doc["_id"]), data = doc.copy()
status=doc["status"], # If file_path is not in data, fall back to the "no-file-path" placeholder
created_at=doc.get("created_at"), if "file_path" not in data:
updated_at=doc.get("updated_at"), data["file_path"] = "no-file-path"
chunks_count=doc.get("chunks_count", -1), processed_result[doc["_id"]] = DocProcessingStatus(**data)
chunks_list=doc.get("chunks_list", []), except KeyError as e:
) logger.error(f"Missing required field for document {doc['_id']}: {e}")
for doc in result continue
} return processed_result
async def index_done_callback(self) -> None: async def index_done_callback(self) -> None:
# Mongo handles persistence automatically # Mongo handles persistence automatically