From 19c16bc4644867b3c6f63e800b07b2081d83fb06 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 2 Dec 2025 17:49:48 +0800 Subject: [PATCH] Add content deduplication check for document insertion endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Check content hash before insertion • Return duplicated status if exists • Use sanitized text for hash computation • Apply to both single and batch inserts • Prevent duplicate content processing --- lightrag/api/routers/document_routes.py | 35 ++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 40bf2dba..d906aa5c 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -24,7 +24,11 @@ from pydantic import BaseModel, Field, field_validator from lightrag import LightRAG from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus -from lightrag.utils import generate_track_id +from lightrag.utils import ( + generate_track_id, + compute_mdhash_id, + sanitize_text_for_encoding, +) from lightrag.api.utils_api import get_combined_auth_dependency from ..config import global_args @@ -2179,6 +2183,20 @@ def create_document_routes( track_id=existing_track_id, ) + # Check if content already exists by computing content hash (doc_id) + sanitized_text = sanitize_text_for_encoding(request.text) + content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-") + existing_doc = await rag.doc_status.get_by_id(content_doc_id) + if existing_doc: + # Content already exists, return duplicated with existing track_id + status = existing_doc.get("status", "unknown") + existing_track_id = existing_doc.get("track_id") or "" + return InsertResponse( + status="duplicated", + message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).", + track_id=existing_track_id, + ) + # Generate track_id for text insertion track_id = generate_track_id("insert") @@ -2247,6 +2265,21 @@ def create_document_routes( track_id=existing_track_id, ) + # Check if any content already exists by computing content hash (doc_id) + for text in request.texts: + sanitized_text = sanitize_text_for_encoding(text) + content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-") + existing_doc = await rag.doc_status.get_by_id(content_doc_id) + if existing_doc: + # Content already exists, return duplicated with existing track_id + status = existing_doc.get("status", "unknown") + existing_track_id = existing_doc.get("track_id") or "" + return InsertResponse( + status="duplicated", + message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).", + track_id=existing_track_id, + ) + # Generate track_id for texts insertion track_id = generate_track_id("insert")