Remove content fallback and standardize doc status handling
- Remove content_summary fallback logic
- Standardize doc status processing
- Handle missing file_path consistently
This commit is contained in:
parent
24c36d876c
commit
92bbb7a1b3
2 changed files with 13 additions and 16 deletions
|
|
@@ -95,9 +95,6 @@ class JsonDocStatusStorage(DocStatusStorage):
|
||||||
try:
|
try:
|
||||||
# Make a copy of the data to avoid modifying the original
|
# Make a copy of the data to avoid modifying the original
|
||||||
data = v.copy()
|
data = v.copy()
|
||||||
# If content is missing, use content_summary as content
|
|
||||||
if "content" not in data and "content_summary" in data:
|
|
||||||
data["content"] = data["content_summary"]
|
|
||||||
# If file_path is not in data, use document id as file path
|
# If file_path is not in data, use document id as file path
|
||||||
if "file_path" not in data:
|
if "file_path" not in data:
|
||||||
data["file_path"] = "no-file-path"
|
data["file_path"] = "no-file-path"
|
||||||
|
|
|
||||||
|
|
@@ -372,19 +372,19 @@ class MongoDocStatusStorage(DocStatusStorage):
|
||||||
"""Get all documents with a specific status"""
|
"""Get all documents with a specific status"""
|
||||||
cursor = self._data.find({"status": status.value})
|
cursor = self._data.find({"status": status.value})
|
||||||
result = await cursor.to_list()
|
result = await cursor.to_list()
|
||||||
return {
|
processed_result = {}
|
||||||
doc["_id"]: DocProcessingStatus(
|
for doc in result:
|
||||||
content_summary=doc.get("content_summary"),
|
try:
|
||||||
content_length=doc["content_length"],
|
# Make a copy of the data to avoid modifying the original
|
||||||
file_path=doc.get("file_path", doc["_id"]),
|
data = doc.copy()
|
||||||
status=doc["status"],
|
# If file_path is not in data, use document id as file path
|
||||||
created_at=doc.get("created_at"),
|
if "file_path" not in data:
|
||||||
updated_at=doc.get("updated_at"),
|
data["file_path"] = "no-file-path"
|
||||||
chunks_count=doc.get("chunks_count", -1),
|
processed_result[doc["_id"]] = DocProcessingStatus(**data)
|
||||||
chunks_list=doc.get("chunks_list", []),
|
except KeyError as e:
|
||||||
)
|
logger.error(f"Missing required field for document {doc['_id']}: {e}")
|
||||||
for doc in result
|
continue
|
||||||
}
|
return processed_result
|
||||||
|
|
||||||
async def index_done_callback(self) -> None:
|
async def index_done_callback(self) -> None:
|
||||||
# Mongo handles persistence automatically
|
# Mongo handles persistence automatically
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue