Add content deduplication check for document insertion endpoints
• Check content hash before insertion • Return duplicated status if exists • Use sanitized text for hash computation • Apply to both single and batch inserts • Prevent duplicate content processing
This commit is contained in:
parent
8d28b95966
commit
19c16bc464
1 changed files with 34 additions and 1 deletions
|
|
@ -24,7 +24,11 @@ from pydantic import BaseModel, Field, field_validator
|
|||
|
||||
from lightrag import LightRAG
|
||||
from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus
|
||||
from lightrag.utils import generate_track_id
|
||||
from lightrag.utils import (
|
||||
generate_track_id,
|
||||
compute_mdhash_id,
|
||||
sanitize_text_for_encoding,
|
||||
)
|
||||
from lightrag.api.utils_api import get_combined_auth_dependency
|
||||
from ..config import global_args
|
||||
|
||||
|
|
@ -2179,6 +2183,20 @@ def create_document_routes(
|
|||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Check if content already exists by computing content hash (doc_id)
|
||||
sanitized_text = sanitize_text_for_encoding(request.text)
|
||||
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
|
||||
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
|
||||
if existing_doc:
|
||||
# Content already exists, return duplicated with existing track_id
|
||||
status = existing_doc.get("status", "unknown")
|
||||
existing_track_id = existing_doc.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Generate track_id for text insertion
|
||||
track_id = generate_track_id("insert")
|
||||
|
||||
|
|
@ -2247,6 +2265,21 @@ def create_document_routes(
|
|||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Check if any content already exists by computing content hash (doc_id)
|
||||
for text in request.texts:
|
||||
sanitized_text = sanitize_text_for_encoding(text)
|
||||
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
|
||||
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
|
||||
if existing_doc:
|
||||
# Content already exists, return duplicated with existing track_id
|
||||
status = existing_doc.get("status", "unknown")
|
||||
existing_track_id = existing_doc.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Generate track_id for texts insertion
|
||||
track_id = generate_track_id("insert")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue