Add content deduplication check for document insertion endpoints
• Check content hash before insertion
• Return duplicated status if the content already exists
• Use sanitized text for hash computation
• Apply to both single and batch inserts
• Prevent duplicate content processing
This commit is contained in:
parent
8d28b95966
commit
19c16bc464
1 changed file with 34 additions and 1 deletion
|
|
@@ -24,7 +24,11 @@ from pydantic import BaseModel, Field, field_validator
|
||||||
|
|
||||||
from lightrag import LightRAG
|
from lightrag import LightRAG
|
||||||
from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus
|
from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus
|
||||||
from lightrag.utils import generate_track_id
|
from lightrag.utils import (
|
||||||
|
generate_track_id,
|
||||||
|
compute_mdhash_id,
|
||||||
|
sanitize_text_for_encoding,
|
||||||
|
)
|
||||||
from lightrag.api.utils_api import get_combined_auth_dependency
|
from lightrag.api.utils_api import get_combined_auth_dependency
|
||||||
from ..config import global_args
|
from ..config import global_args
|
||||||
|
|
||||||
|
|
@@ -2179,6 +2183,20 @@ def create_document_routes(
|
||||||
track_id=existing_track_id,
|
track_id=existing_track_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Check if content already exists by computing content hash (doc_id)
|
||||||
|
sanitized_text = sanitize_text_for_encoding(request.text)
|
||||||
|
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
|
||||||
|
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
|
||||||
|
if existing_doc:
|
||||||
|
# Content already exists, return duplicated with existing track_id
|
||||||
|
status = existing_doc.get("status", "unknown")
|
||||||
|
existing_track_id = existing_doc.get("track_id") or ""
|
||||||
|
return InsertResponse(
|
||||||
|
status="duplicated",
|
||||||
|
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
|
||||||
|
track_id=existing_track_id,
|
||||||
|
)
|
||||||
|
|
||||||
# Generate track_id for text insertion
|
# Generate track_id for text insertion
|
||||||
track_id = generate_track_id("insert")
|
track_id = generate_track_id("insert")
|
||||||
|
|
||||||
|
|
@@ -2247,6 +2265,21 @@ def create_document_routes(
|
||||||
track_id=existing_track_id,
|
track_id=existing_track_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Check if any content already exists by computing content hash (doc_id)
|
||||||
|
for text in request.texts:
|
||||||
|
sanitized_text = sanitize_text_for_encoding(text)
|
||||||
|
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
|
||||||
|
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
|
||||||
|
if existing_doc:
|
||||||
|
# Content already exists, return duplicated with existing track_id
|
||||||
|
status = existing_doc.get("status", "unknown")
|
||||||
|
existing_track_id = existing_doc.get("track_id") or ""
|
||||||
|
return InsertResponse(
|
||||||
|
status="duplicated",
|
||||||
|
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
|
||||||
|
track_id=existing_track_id,
|
||||||
|
)
|
||||||
|
|
||||||
# Generate track_id for texts insertion
|
# Generate track_id for texts insertion
|
||||||
track_id = generate_track_id("insert")
|
track_id = generate_track_id("insert")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue