Remove content fallback and standardize doc status handling

- Remove content_summary fallback logic
- Standardize doc status processing
- Handle missing file_path consistently
This commit is contained in:
yangdx 2025-07-29 16:13:51 +08:00
parent 24c36d876c
commit 92bbb7a1b3
2 changed files with 13 additions and 16 deletions

View file

@ -95,9 +95,6 @@ class JsonDocStatusStorage(DocStatusStorage):
try:
# Make a copy of the data to avoid modifying the original
data = v.copy()
# If content is missing, use content_summary as content
if "content" not in data and "content_summary" in data:
data["content"] = data["content_summary"]
# If file_path is not in data, fall back to the "no-file-path" placeholder
if "file_path" not in data:
data["file_path"] = "no-file-path"

View file

@ -372,19 +372,19 @@ class MongoDocStatusStorage(DocStatusStorage):
"""Get all documents with a specific status"""
cursor = self._data.find({"status": status.value})
result = await cursor.to_list()
return {
doc["_id"]: DocProcessingStatus(
content_summary=doc.get("content_summary"),
content_length=doc["content_length"],
file_path=doc.get("file_path", doc["_id"]),
status=doc["status"],
created_at=doc.get("created_at"),
updated_at=doc.get("updated_at"),
chunks_count=doc.get("chunks_count", -1),
chunks_list=doc.get("chunks_list", []),
)
for doc in result
}
processed_result = {}
for doc in result:
try:
# Make a copy of the data to avoid modifying the original
data = doc.copy()
# If file_path is not in data, fall back to the "no-file-path" placeholder
if "file_path" not in data:
data["file_path"] = "no-file-path"
processed_result[doc["_id"]] = DocProcessingStatus(**data)
except KeyError as e:
logger.error(f"Missing required field for document {doc['_id']}: {e}")
continue
return processed_result
async def index_done_callback(self) -> None:
# Mongo handles persistence automatically