Fix duplicate document responses to return original track_id
- Return existing track_id for duplicates
- Remove track_id generation in reprocess
- Update reprocess response documentation
- Clarify track_id behavior in comments
- Update API response examples
(cherry picked from commit 8d28b95966)
This commit is contained in:
parent
7e591a81c0
commit
d0e3c8a4a3
1 changed file with 25 additions and 20 deletions
|
|
@@ -163,7 +163,7 @@ class ReprocessResponse(BaseModel):
|
|||
Attributes:
|
||||
status: Status of the reprocessing operation
|
||||
message: Message describing the operation result
|
||||
track_id: Tracking ID for monitoring reprocessing progress
|
||||
track_id: Always empty string. Reprocessed documents retain their original track_id.
|
||||
"""
|
||||
|
||||
status: Literal["reprocessing_started"] = Field(
|
||||
|
|
@@ -171,7 +171,8 @@ class ReprocessResponse(BaseModel):
|
|||
)
|
||||
message: str = Field(description="Human-readable message describing the operation")
|
||||
track_id: str = Field(
|
||||
description="Tracking ID for monitoring reprocessing progress"
|
||||
default="",
|
||||
description="Always empty string. Reprocessed documents retain their original track_id from initial upload.",
|
||||
)
|
||||
|
||||
class Config:
|
||||
|
|
@@ -179,7 +180,7 @@ class ReprocessResponse(BaseModel):
|
|||
"example": {
|
||||
"status": "reprocessing_started",
|
||||
"message": "Reprocessing of failed documents has been initiated in background",
|
||||
"track_id": "retry_20250729_170612_def456",
|
||||
"track_id": "",
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@@ -1948,12 +1949,14 @@ def create_document_routes(
|
|||
# Check if filename already exists in doc_status storage
|
||||
existing_doc_data = await rag.doc_status.get_doc_by_file_path(safe_filename)
|
||||
if existing_doc_data:
|
||||
# Get document status information for error message
|
||||
# Get document status and track_id from existing document
|
||||
status = existing_doc_data.get("status", "unknown")
|
||||
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
|
||||
existing_track_id = existing_doc_data.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"File '{safe_filename}' already exists in document storage (Status: {status}).",
|
||||
track_id="",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
file_path = doc_manager.input_dir / safe_filename
|
||||
|
|
@@ -2017,12 +2020,14 @@ def create_document_routes(
|
|||
request.file_source
|
||||
)
|
||||
if existing_doc_data:
|
||||
# Get document status information for error message
|
||||
# Get document status and track_id from existing document
|
||||
status = existing_doc_data.get("status", "unknown")
|
||||
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
|
||||
existing_track_id = existing_doc_data.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"File source '{request.file_source}' already exists in document storage (Status: {status}).",
|
||||
track_id="",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Check if content already exists by computing content hash (doc_id)
|
||||
|
|
@@ -2097,12 +2102,14 @@ def create_document_routes(
|
|||
file_source
|
||||
)
|
||||
if existing_doc_data:
|
||||
# Get document status information for error message
|
||||
# Get document status and track_id from existing document
|
||||
status = existing_doc_data.get("status", "unknown")
|
||||
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
|
||||
existing_track_id = existing_doc_data.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"File source '{file_source}' already exists in document storage (Status: {status}).",
|
||||
track_id="",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Check if any content already exists by computing content hash (doc_id)
|
||||
|
|
@@ -2926,29 +2933,27 @@ def create_document_routes(
|
|||
This is useful for recovering from server crashes, network errors, LLM service
|
||||
outages, or other temporary failures that caused document processing to fail.
|
||||
|
||||
The processing happens in the background and can be monitored using the
|
||||
returned track_id or by checking the pipeline status.
|
||||
The processing happens in the background and can be monitored by checking the
|
||||
pipeline status. The reprocessed documents retain their original track_id from
|
||||
initial upload, so use their original track_id to monitor progress.
|
||||
|
||||
Returns:
|
||||
ReprocessResponse: Response with status, message, and track_id
|
||||
ReprocessResponse: Response with status and message.
|
||||
track_id is always empty string because reprocessed documents retain
|
||||
their original track_id from initial upload.
|
||||
|
||||
Raises:
|
||||
HTTPException: If an error occurs while initiating reprocessing (500).
|
||||
"""
|
||||
try:
|
||||
# Generate track_id with "retry" prefix for retry operation
|
||||
track_id = generate_track_id("retry")
|
||||
|
||||
# Start the reprocessing in the background
|
||||
# Note: Reprocessed documents retain their original track_id from initial upload
|
||||
background_tasks.add_task(rag.apipeline_process_enqueue_documents)
|
||||
logger.info(
|
||||
f"Reprocessing of failed documents initiated with track_id: {track_id}"
|
||||
)
|
||||
logger.info("Reprocessing of failed documents initiated")
|
||||
|
||||
return ReprocessResponse(
|
||||
status="reprocessing_started",
|
||||
message="Reprocessing of failed documents has been initiated in background",
|
||||
track_id=track_id,
|
||||
message="Reprocessing of failed documents has been initiated in background. Documents retain their original track_id.",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue