Add content deduplication check for document insertion endpoints

• Check content hash before insertion
• Return "duplicated" status if identical content already exists
• Use sanitized text for hash computation
• Apply to both single and batch inserts
• Prevent duplicate content processing
This commit is contained in:
yangdx 2025-12-02 17:49:48 +08:00
parent 8d28b95966
commit 19c16bc464

View file

@ -24,7 +24,11 @@ from pydantic import BaseModel, Field, field_validator
from lightrag import LightRAG from lightrag import LightRAG
from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus
from lightrag.utils import generate_track_id from lightrag.utils import (
generate_track_id,
compute_mdhash_id,
sanitize_text_for_encoding,
)
from lightrag.api.utils_api import get_combined_auth_dependency from lightrag.api.utils_api import get_combined_auth_dependency
from ..config import global_args from ..config import global_args
@ -2179,6 +2183,20 @@ def create_document_routes(
track_id=existing_track_id, track_id=existing_track_id,
) )
# Check if content already exists by computing content hash (doc_id)
sanitized_text = sanitize_text_for_encoding(request.text)
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
if existing_doc:
# Content already exists, return duplicated with existing track_id
status = existing_doc.get("status", "unknown")
existing_track_id = existing_doc.get("track_id") or ""
return InsertResponse(
status="duplicated",
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
track_id=existing_track_id,
)
# Generate track_id for text insertion # Generate track_id for text insertion
track_id = generate_track_id("insert") track_id = generate_track_id("insert")
@ -2247,6 +2265,21 @@ def create_document_routes(
track_id=existing_track_id, track_id=existing_track_id,
) )
# Check if any content already exists by computing content hash (doc_id)
for text in request.texts:
sanitized_text = sanitize_text_for_encoding(text)
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
if existing_doc:
# Content already exists, return duplicated with existing track_id
status = existing_doc.get("status", "unknown")
existing_track_id = existing_doc.get("track_id") or ""
return InsertResponse(
status="duplicated",
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
track_id=existing_track_id,
)
# Generate track_id for texts insertion # Generate track_id for texts insertion
track_id = generate_track_id("insert") track_id = generate_track_id("insert")