From 54f0a7d1ca09fb7297ff90d1a9962f2087a7c71a Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Tue, 14 Oct 2025 14:47:04 +0500 Subject: [PATCH 01/25] Quick fix to limit source_id ballooning while inserting nodes --- lightrag/constants.py | 1 + lightrag/operate.py | 15 +++++++++++---- lightrag/utils.py | 15 +++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index 14584559..6fb9feb4 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 +DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/operate.py b/lightrag/operate.py index a12cb63f..29a17e68 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -27,6 +27,7 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, + truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -52,6 +53,7 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -1371,9 +1373,11 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in nodes_data] + already_source_ids) - ) + merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) + + source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_id = GRAPH_FIELD_SEP.join(source_ids) + file_path = build_file_path(already_file_paths, nodes_data, entity_name) node_data = dict( @@ -1658,6 +1662,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: + logger.info(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1673,7 +1678,7 @@ async def merge_nodes_and_edges( if entity_vdb is not None and entity_data: data_for_vdb = { compute_mdhash_id( - entity_data["entity_name"], prefix="ent-" + str(entity_data["entity_name"]), prefix="ent-" ): { "entity_name": entity_data["entity_name"], "entity_type": entity_data["entity_type"], @@ -1685,6 +1690,8 @@ async def merge_nodes_and_edges( } } + + logger.info(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index 83a3c394..17ee43a6 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,6 +26,7 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2464,6 +2465,20 @@ async def process_chunks_unified( return final_chunks +def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: + """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" + already_len: int = len(chunk_ids) + + 
if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + logger.warning( + f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"current size: {already_len} entries." + ) + + truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + + return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From d52c3377b4cb1d3be0ed13e8e7a7119f2c73d348 Mon Sep 17 00:00:00 2001 From: haseebuchiha Date: Tue, 14 Oct 2025 16:14:03 +0500 Subject: [PATCH 02/25] Import from env and use default if none and removed useless import --- env.example | 2 ++ lightrag/operate.py | 1 - lightrag/utils.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/env.example b/env.example index 4c8d355d..1d2b81f3 100644 --- a/env.example +++ b/env.example @@ -73,6 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 +### control the maximum chunk_ids stored in vector db +# MAX_CHUNK_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/operate.py b/lightrag/operate.py index 29a17e68..34a8a613 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -53,7 +53,6 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time diff --git a/lightrag/utils.py b/lightrag/utils.py index 17ee43a6..b33c5a15 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2469,13 +2469,15 @@ def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + + if already_len >= max_chunk_ids_per_entity: logger.warning( - f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " f"current size: {already_len} entries." 
) - truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) return truncated_chunk_ids From c06522b927da81e68898f94c45fa918458d851df Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Wed, 15 Oct 2025 18:24:38 +0500 Subject: [PATCH 03/25] Get max source Id config from .env and lightRAG init --- env.example | 4 ++-- lightrag/constants.py | 2 +- lightrag/lightrag.py | 6 ++++++ lightrag/operate.py | 6 +++--- lightrag/utils.py | 21 +++++++++++---------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/env.example b/env.example index 1d2b81f3..e0b649e3 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector db -# MAX_CHUNK_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored +# MAX_SOURCE_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index 6fb9feb4..f7b5c41f 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d288685e..2b18f961 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -39,6 +39,7 @@ from lightrag.constants import ( DEFAULT_MAX_ASYNC, DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, + DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, @@ -359,6 +360,11 @@ class LightRAG: ) """Maximum number of graph nodes to return in knowledge graph queries.""" + max_source_ids_per_entity: int = field( + default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + ) + """Maximum number of source (chunk) ids in entity Grpah + VDB.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 34a8a613..7a8b6391 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert( merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1690,7 +1690,7 @@ async def 
merge_nodes_and_edges( } - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index b33c5a15..cf585016 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,7 +26,6 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2465,23 +2464,25 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: +def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] + + if already_len <= max_chunk_ids_per_entity: + return chunk_ids + + logger.warning( + f"Source Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " + f"current size: {already_len}, truncating..." + ) - if already_len >= max_chunk_ids_per_entity: - logger.warning( - f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len} entries." - ) - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From 7871600d8a101504f06566c87bf6bc9125206330 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Tue, 14 Oct 2025 14:47:04 +0500 Subject: [PATCH 04/25] Quick fix to limit source_id ballooning while inserting nodes --- lightrag/constants.py | 1 + lightrag/operate.py | 13 ++++++++++--- lightrag/utils.py | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index 14584559..6fb9feb4 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 +DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/operate.py b/lightrag/operate.py index a12cb63f..cee8f377 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -27,6 +27,7 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, + truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -52,6 +53,7 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -1371,9 +1373,11 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in nodes_data] + 
already_source_ids) - ) + merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) + + source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_id = GRAPH_FIELD_SEP.join(source_ids) + file_path = build_file_path(already_file_paths, nodes_data, entity_name) node_data = dict( @@ -1658,6 +1662,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: + logger.info(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1685,6 +1690,8 @@ async def merge_nodes_and_edges( } } + + logger.info(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index 83a3c394..17ee43a6 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,6 +26,7 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2464,6 +2465,20 @@ async def process_chunks_unified( return final_chunks +def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: + """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" + already_len: int = len(chunk_ids) + + if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + logger.warning( + f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"current size: {already_len} entries." + ) + + truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + + return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From 4e740af79b538653d127708aecfc9a109d276d17 Mon Sep 17 00:00:00 2001 From: haseebuchiha Date: Tue, 14 Oct 2025 16:14:03 +0500 Subject: [PATCH 05/25] Import from env and use default if none and removed useless import --- env.example | 2 ++ lightrag/operate.py | 1 - lightrag/utils.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/env.example b/env.example index 4c8d355d..1d2b81f3 100644 --- a/env.example +++ b/env.example @@ -73,6 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 +### control the maximum chunk_ids stored in vector db +# MAX_CHUNK_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/operate.py b/lightrag/operate.py index cee8f377..0476d169 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -53,7 +53,6 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time diff --git a/lightrag/utils.py b/lightrag/utils.py index 17ee43a6..b33c5a15 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2469,13 +2469,15 @@ def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper 
limits)""" already_len: int = len(chunk_ids) - if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + + if already_len >= max_chunk_ids_per_entity: logger.warning( - f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " f"current size: {already_len} entries." ) - truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) return truncated_chunk_ids From 17c2a929d2c6aa4e5767f21e4b20348be72b3184 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Wed, 15 Oct 2025 18:24:38 +0500 Subject: [PATCH 06/25] Get max source Id config from .env and lightRAG init --- env.example | 4 ++-- lightrag/constants.py | 2 +- lightrag/lightrag.py | 6 ++++++ lightrag/operate.py | 6 +++--- lightrag/utils.py | 21 +++++++++++---------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/env.example b/env.example index 1d2b81f3..e0b649e3 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector db -# MAX_CHUNK_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored +# MAX_SOURCE_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index 6fb9feb4..f7b5c41f 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d288685e..2b18f961 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -39,6 +39,7 @@ from lightrag.constants import ( DEFAULT_MAX_ASYNC, DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, + DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, @@ -359,6 +360,11 @@ class LightRAG: ) """Maximum number of graph nodes to return in knowledge graph queries.""" + max_source_ids_per_entity: int = field( + default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + ) + """Maximum number of source (chunk) ids in entity Grpah + VDB.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 0476d169..12afffa1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert( merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = 
GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1690,7 +1690,7 @@ async def merge_nodes_and_edges( } - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index b33c5a15..cf585016 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,7 +26,6 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2465,23 +2464,25 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: +def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] + + if already_len <= max_chunk_ids_per_entity: + return chunk_ids + + logger.warning( + f"Source Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " + f"current size: {already_len}, truncating..." + ) - if already_len >= max_chunk_ids_per_entity: - logger.warning( - f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len} entries." 
- ) - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From dc62c78f981f67fa9f7e895b7b401035c788271f Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 15:24:15 +0800 Subject: [PATCH 07/25] Add entity/relation chunk tracking with configurable source ID limits - Add entity_chunks & relation_chunks storage - Implement KEEP/FIFO limit strategies - Update env.example with new settings - Add migration for chunk tracking data - Support all KV storage --- env.example | 7 +- lightrag/api/routers/document_routes.py | 2 + lightrag/base.py | 8 + lightrag/constants.py | 11 +- lightrag/kg/json_doc_status_impl.py | 14 + lightrag/kg/json_kv_impl.py | 29 +- lightrag/kg/mongo_impl.py | 44 ++- lightrag/kg/postgres_impl.py | 303 +++++++++++------ lightrag/kg/redis_impl.py | 80 ++--- lightrag/lightrag.py | 338 +++++++++++++++++-- lightrag/namespace.py | 2 + lightrag/operate.py | 423 ++++++++++++++++++++---- lightrag/utils.py | 127 ++++++- 13 files changed, 1098 insertions(+), 290 deletions(-) diff --git a/env.example b/env.example index b08f1758..6d53c390 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,11 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored -# MAX_SOURCE_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored in vector and graph db +# MAX_SOURCE_IDS_PER_ENTITY=300 +# MAX_SOURCE_IDS_PER_RELATION=300 +### control chunk_ids limitation method: KEEP, FIFO (KEPP: Ingore New Chunks, FIFO: New chunks replace old chunks) +# SOURCE_IDS_LIMIT_METHOD=KEEP ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 7dc2c34c..0ed5a711 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -2003,6 +2003,8 @@ def create_document_routes( rag.full_docs, rag.full_entities, rag.full_relations, + rag.entity_chunks, + rag.relation_chunks, rag.entities_vdb, rag.relationships_vdb, rag.chunks_vdb, diff --git a/lightrag/base.py b/lightrag/base.py index 45c5cb2c..e569de2a 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -355,6 +355,14 @@ class BaseKVStorage(StorageNameSpace, ABC): None """ + @abstractmethod + async def is_empty(self) -> bool: + """Check if the storage is empty + + Returns: + bool: True if storage contains no data, False otherwise + """ + @dataclass class BaseGraphStorage(StorageNameSpace, ABC): diff --git a/lightrag/constants.py b/lightrag/constants.py index f7b5c41f..e374a991 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,16 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs + +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 +DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 +SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" +SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" +DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP +VALID_SOURCE_IDS_LIMIT_METHODS = { + 
SOURCE_IDS_LIMIT_METHOD_KEEP, + SOURCE_IDS_LIMIT_METHOD_FIFO, +} # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index e6d101a7..014499f2 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -187,6 +187,20 @@ class JsonDocStatusStorage(DocStatusStorage): await self.index_done_callback() + async def is_empty(self) -> bool: + """Check if the storage is empty + + Returns: + bool: True if storage is empty, False otherwise + + Raises: + StorageNotInitializedError: If storage is not initialized + """ + if self._storage_lock is None: + raise StorageNotInitializedError("JsonDocStatusStorage") + async with self._storage_lock: + return len(self._data) == 0 + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: async with self._storage_lock: return self._data.get(id) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 553ba417..fd016b14 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -84,26 +84,6 @@ class JsonKVStorage(BaseKVStorage): write_json(data_dict, self._file_name) await clear_all_update_flags(self.final_namespace) - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - async with self._storage_lock: - result = {} - for key, value in self._data.items(): - if value: - # Create a copy to avoid modifying the original data - data = dict(value) - # Ensure time fields are present, provide default values for old data - data.setdefault("create_time", 0) - data.setdefault("update_time", 0) - result[key] = data - else: - result[key] = value - return result - async def get_by_id(self, id: str) -> dict[str, Any] | None: async with self._storage_lock: result = self._data.get(id) @@ -200,6 +180,15 @@ class JsonKVStorage(BaseKVStorage): if any_deleted: await set_all_update_flags(self.final_namespace) + async def is_empty(self) -> bool: + """Check if the storage is empty + + Returns: + bool: True if storage contains no data, False otherwise + """ + async with self._storage_lock: + return len(self._data) == 0 + async def drop(self) -> dict[str, str]: """Drop all data from storage and clean up resources This action will persistent the data to disk immediately. 
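The is_empty() methods added across the KV backends in this patch all follow the contract declared in base.py above: return True when the namespace holds no data for the current workspace. The Mongo, PostgreSQL and Redis implementations also return True when the check itself fails, and the migration code in lightrag.py falls back to the same assumption, so a transient storage error leads to re-seeding rather than silently skipping the chunk-tracking backfill. A minimal sketch of how a caller might gate the backfill on that contract (the helper below is illustrative and not part of the patch; only BaseKVStorage.is_empty() is assumed):

from lightrag.base import BaseKVStorage

async def needs_chunk_backfill(
    entity_chunks: BaseKVStorage, relation_chunks: BaseKVStorage
) -> tuple[bool, bool]:
    # Mirrors the defensive behaviour of _migrate_chunk_tracking_storage:
    # if the emptiness check itself fails, assume the store needs seeding.
    async def _safe_is_empty(storage: BaseKVStorage) -> bool:
        try:
            return await storage.is_empty()
        except Exception:
            return True

    return await _safe_is_empty(entity_chunks), await _safe_is_empty(relation_chunks)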
diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index a62c3031..e55062f1 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -175,22 +175,6 @@ class MongoKVStorage(BaseKVStorage): existing_ids = {str(x["_id"]) async for x in cursor} return keys - existing_ids - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - cursor = self._data.find({}) - result = {} - async for doc in cursor: - doc_id = doc.pop("_id") - # Ensure time fields are present for all documents - doc.setdefault("create_time", 0) - doc.setdefault("update_time", 0) - result[doc_id] = doc - return result - async def upsert(self, data: dict[str, dict[str, Any]]) -> None: logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}") if not data: @@ -236,6 +220,20 @@ class MongoKVStorage(BaseKVStorage): # Mongo handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + try: + # Use count_documents with limit 1 for efficiency + count = await self._data.count_documents({}, limit=1) + return count == 0 + except PyMongoError as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: """Delete documents with specified IDs @@ -466,6 +464,20 @@ class MongoDocStatusStorage(DocStatusStorage): # Mongo handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + try: + # Use count_documents with limit 1 for efficiency + count = await self._data.count_documents({}, limit=1) + return count == 0 + except PyMongoError as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection. 
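The limit strategies configured above (SOURCE_IDS_LIMIT_METHOD with the KEEP and FIFO constants from constants.py) are enforced by apply_source_ids_limit, which operate.py imports from lightrag.utils later in this patch; its implementation is not shown in this excerpt. A sketch consistent with the documented semantics, where KEEP ignores chunk ids that arrive after the cap is reached and FIFO lets new ids displace the oldest, might look like this (illustrative only, not the library's implementation):

from lightrag.constants import (
    SOURCE_IDS_LIMIT_METHOD_FIFO,
    SOURCE_IDS_LIMIT_METHOD_KEEP,
)

def limit_source_ids(
    chunk_ids: list[str],
    limit: int,
    method: str = SOURCE_IDS_LIMIT_METHOD_KEEP,
) -> list[str]:
    # Illustrative cap on an ordered list of chunk ids (oldest first).
    if limit <= 0 or len(chunk_ids) <= limit:
        return list(chunk_ids)
    if method == SOURCE_IDS_LIMIT_METHOD_FIFO:
        # FIFO: new chunks replace old chunks, so keep the most recent ids.
        return chunk_ids[-limit:]
    # KEEP (default): ignore new chunks once the cap is reached.
    return chunk_ids[:limit]

Under the in-code defaults introduced here (DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 with KEEP), an entity that already tracks three chunk ids would ignore ids from newly inserted documents, while FIFO would rotate the oldest id out instead.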
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 54bdf0f6..3899fa20 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -1656,113 +1656,6 @@ class PGKVStorage(BaseKVStorage): self.db = None ################ QUERY METHODS ################ - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - table_name = namespace_to_table_name(self.namespace) - if not table_name: - logger.error( - f"[{self.workspace}] Unknown namespace for get_all: {self.namespace}" - ) - return {} - - sql = f"SELECT * FROM {table_name} WHERE workspace=$1" - params = {"workspace": self.workspace} - - try: - results = await self.db.query(sql, list(params.values()), multirows=True) - - # Special handling for LLM cache to ensure compatibility with _get_cached_extraction_results - if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): - processed_results = {} - for row in results: - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - # Map field names and add cache_type for compatibility - processed_row = { - **row, - "return": row.get("return_value", ""), - "cache_type": row.get("original_prompt", "unknow"), - "original_prompt": row.get("original_prompt", ""), - "chunk_id": row.get("chunk_id"), - "mode": row.get("mode", "default"), - "create_time": create_time, - "update_time": create_time if update_time == 0 else update_time, - } - processed_results[row["id"]] = processed_row - return processed_results - - # For text_chunks namespace, parse llm_cache_list JSON string back to list - if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): - processed_results = {} - for row in results: - llm_cache_list = row.get("llm_cache_list", []) - if isinstance(llm_cache_list, str): - try: - llm_cache_list = json.loads(llm_cache_list) - except json.JSONDecodeError: - llm_cache_list = [] - row["llm_cache_list"] = llm_cache_list - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - row["create_time"] = create_time - row["update_time"] = ( - create_time if update_time == 0 else update_time - ) - processed_results[row["id"]] = row - return processed_results - - # For FULL_ENTITIES namespace, parse entity_names JSON string back to list - if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_ENTITIES): - processed_results = {} - for row in results: - entity_names = row.get("entity_names", []) - if isinstance(entity_names, str): - try: - entity_names = json.loads(entity_names) - except json.JSONDecodeError: - entity_names = [] - row["entity_names"] = entity_names - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - row["create_time"] = create_time - row["update_time"] = ( - create_time if update_time == 0 else update_time - ) - processed_results[row["id"]] = row - return processed_results - - # For FULL_RELATIONS namespace, parse relation_pairs JSON string back to list - if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_RELATIONS): - processed_results = {} - for row in results: - relation_pairs = row.get("relation_pairs", []) - if isinstance(relation_pairs, str): - try: - relation_pairs = json.loads(relation_pairs) - except json.JSONDecodeError: - relation_pairs = [] - row["relation_pairs"] = relation_pairs - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - row["create_time"] = create_time - row["update_time"] = ( - create_time if update_time == 
0 else update_time - ) - processed_results[row["id"]] = row - return processed_results - - # For other namespaces, return as-is - return {row["id"]: row for row in results} - except Exception as e: - logger.error( - f"[{self.workspace}] Error retrieving all data from {self.namespace}: {e}" - ) - return {} - async def get_by_id(self, id: str) -> dict[str, Any] | None: """Get data by id.""" sql = SQL_TEMPLATES["get_by_id_" + self.namespace] @@ -1838,6 +1731,38 @@ class PGKVStorage(BaseKVStorage): response["create_time"] = create_time response["update_time"] = create_time if update_time == 0 else update_time + # Special handling for ENTITY_CHUNKS namespace + if response and is_namespace(self.namespace, NameSpace.KV_STORE_ENTITY_CHUNKS): + # Parse chunk_ids JSON string back to list + chunk_ids = response.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + response["chunk_ids"] = chunk_ids + create_time = response.get("create_time", 0) + update_time = response.get("update_time", 0) + response["create_time"] = create_time + response["update_time"] = create_time if update_time == 0 else update_time + + # Special handling for RELATION_CHUNKS namespace + if response and is_namespace( + self.namespace, NameSpace.KV_STORE_RELATION_CHUNKS + ): + # Parse chunk_ids JSON string back to list + chunk_ids = response.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + response["chunk_ids"] = chunk_ids + create_time = response.get("create_time", 0) + update_time = response.get("update_time", 0) + response["create_time"] = create_time + response["update_time"] = create_time if update_time == 0 else update_time + return response if response else None # Query by id @@ -1946,6 +1871,38 @@ class PGKVStorage(BaseKVStorage): result["create_time"] = create_time result["update_time"] = create_time if update_time == 0 else update_time + # Special handling for ENTITY_CHUNKS namespace + if results and is_namespace(self.namespace, NameSpace.KV_STORE_ENTITY_CHUNKS): + for result in results: + # Parse chunk_ids JSON string back to list + chunk_ids = result.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + result["chunk_ids"] = chunk_ids + create_time = result.get("create_time", 0) + update_time = result.get("update_time", 0) + result["create_time"] = create_time + result["update_time"] = create_time if update_time == 0 else update_time + + # Special handling for RELATION_CHUNKS namespace + if results and is_namespace(self.namespace, NameSpace.KV_STORE_RELATION_CHUNKS): + for result in results: + # Parse chunk_ids JSON string back to list + chunk_ids = result.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + result["chunk_ids"] = chunk_ids + create_time = result.get("create_time", 0) + update_time = result.get("update_time", 0) + result["create_time"] = create_time + result["update_time"] = create_time if update_time == 0 else update_time + return _order_results(results) async def filter_keys(self, keys: set[str]) -> set[str]: @@ -2050,11 +2007,61 @@ class PGKVStorage(BaseKVStorage): "update_time": current_time, } await self.db.execute(upsert_sql, _data) + elif is_namespace(self.namespace, NameSpace.KV_STORE_ENTITY_CHUNKS): + # Get current UTC time 
and convert to naive datetime for database storage + current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None) + for k, v in data.items(): + upsert_sql = SQL_TEMPLATES["upsert_entity_chunks"] + _data = { + "workspace": self.workspace, + "id": k, + "chunk_ids": json.dumps(v["chunk_ids"]), + "count": v["count"], + "create_time": current_time, + "update_time": current_time, + } + await self.db.execute(upsert_sql, _data) + elif is_namespace(self.namespace, NameSpace.KV_STORE_RELATION_CHUNKS): + # Get current UTC time and convert to naive datetime for database storage + current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None) + for k, v in data.items(): + upsert_sql = SQL_TEMPLATES["upsert_relation_chunks"] + _data = { + "workspace": self.workspace, + "id": k, + "chunk_ids": json.dumps(v["chunk_ids"]), + "count": v["count"], + "create_time": current_time, + "update_time": current_time, + } + await self.db.execute(upsert_sql, _data) async def index_done_callback(self) -> None: # PG handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + table_name = namespace_to_table_name(self.namespace) + if not table_name: + logger.error( + f"[{self.workspace}] Unknown namespace for is_empty check: {self.namespace}" + ) + return True + + sql = f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE workspace=$1 LIMIT 1) as has_data" + + try: + result = await self.db.query(sql, [self.workspace]) + return not result.get("has_data", False) if result else True + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs @@ -2970,6 +2977,28 @@ class PGDocStatusStorage(DocStatusStorage): # PG handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + table_name = namespace_to_table_name(self.namespace) + if not table_name: + logger.error( + f"[{self.workspace}] Unknown namespace for is_empty check: {self.namespace}" + ) + return True + + sql = f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE workspace=$1 LIMIT 1) as has_data" + + try: + result = await self.db.query(sql, [self.workspace]) + return not result.get("has_data", False) if result else True + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs @@ -4721,6 +4750,8 @@ NAMESPACE_TABLE_MAP = { NameSpace.KV_STORE_TEXT_CHUNKS: "LIGHTRAG_DOC_CHUNKS", NameSpace.KV_STORE_FULL_ENTITIES: "LIGHTRAG_FULL_ENTITIES", NameSpace.KV_STORE_FULL_RELATIONS: "LIGHTRAG_FULL_RELATIONS", + NameSpace.KV_STORE_ENTITY_CHUNKS: "LIGHTRAG_ENTITY_CHUNKS", + NameSpace.KV_STORE_RELATION_CHUNKS: "LIGHTRAG_RELATION_CHUNKS", NameSpace.KV_STORE_LLM_RESPONSE_CACHE: "LIGHTRAG_LLM_CACHE", NameSpace.VECTOR_STORE_CHUNKS: "LIGHTRAG_VDB_CHUNKS", NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY", @@ -4861,6 +4892,28 @@ TABLES = { CONSTRAINT LIGHTRAG_FULL_RELATIONS_PK PRIMARY KEY (workspace, id) )""" }, + "LIGHTRAG_ENTITY_CHUNKS": { + "ddl": """CREATE TABLE LIGHTRAG_ENTITY_CHUNKS ( + id VARCHAR(512), + workspace VARCHAR(255), + chunk_ids 
JSONB, + count INTEGER, + create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT LIGHTRAG_ENTITY_CHUNKS_PK PRIMARY KEY (workspace, id) + )""" + }, + "LIGHTRAG_RELATION_CHUNKS": { + "ddl": """CREATE TABLE LIGHTRAG_RELATION_CHUNKS ( + id VARCHAR(512), + workspace VARCHAR(255), + chunk_ids JSONB, + count INTEGER, + create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id) + )""" + }, } @@ -4918,6 +4971,26 @@ SQL_TEMPLATES = { EXTRACT(EPOCH FROM update_time)::BIGINT as update_time FROM LIGHTRAG_FULL_RELATIONS WHERE workspace=$1 AND id = ANY($2) """, + "get_by_id_entity_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_ENTITY_CHUNKS WHERE workspace=$1 AND id=$2 + """, + "get_by_id_relation_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_RELATION_CHUNKS WHERE workspace=$1 AND id=$2 + """, + "get_by_ids_entity_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_ENTITY_CHUNKS WHERE workspace=$1 AND id = ANY($2) + """, + "get_by_ids_relation_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_RELATION_CHUNKS WHERE workspace=$1 AND id = ANY($2) + """, "filter_keys": "SELECT id FROM {table_name} WHERE workspace=$1 AND id IN ({ids})", "upsert_doc_full": """INSERT INTO LIGHTRAG_DOC_FULL (id, content, doc_name, workspace) VALUES ($1, $2, $3, $4) @@ -4965,6 +5038,22 @@ SQL_TEMPLATES = { count=EXCLUDED.count, update_time = EXCLUDED.update_time """, + "upsert_entity_chunks": """INSERT INTO LIGHTRAG_ENTITY_CHUNKS (workspace, id, chunk_ids, count, + create_time, update_time) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (workspace,id) DO UPDATE + SET chunk_ids=EXCLUDED.chunk_ids, + count=EXCLUDED.count, + update_time = EXCLUDED.update_time + """, + "upsert_relation_chunks": """INSERT INTO LIGHTRAG_RELATION_CHUNKS (workspace, id, chunk_ids, count, + create_time, update_time) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (workspace,id) DO UPDATE + SET chunk_ids=EXCLUDED.chunk_ids, + count=EXCLUDED.count, + update_time = EXCLUDED.update_time + """, # SQL for VectorStorage "upsert_chunk": """INSERT INTO LIGHTRAG_VDB_CHUNKS (workspace, id, tokens, chunk_order_index, full_doc_id, content, content_vector, file_path, diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 56569dda..8a393497 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -304,51 +304,6 @@ class RedisKVStorage(BaseKVStorage): logger.error(f"[{self.workspace}] JSON decode error in batch get: {e}") return [None] * len(ids) - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - async with self._get_redis_connection() as redis: - try: - # Get all keys for this namespace - keys = await redis.keys(f"{self.final_namespace}:*") - - if not keys: - return {} - - # Get all values in batch - pipe = redis.pipeline() - for key in keys: - pipe.get(key) - values = await pipe.execute() 
- - # Build result dictionary - result = {} - for key, value in zip(keys, values): - if value: - # Extract the ID part (after namespace:) - key_id = key.split(":", 1)[1] - try: - data = json.loads(value) - # Ensure time fields are present for all documents - data.setdefault("create_time", 0) - data.setdefault("update_time", 0) - result[key_id] = data - except json.JSONDecodeError as e: - logger.error( - f"[{self.workspace}] JSON decode error for key {key}: {e}" - ) - continue - - return result - except Exception as e: - logger.error( - f"[{self.workspace}] Error getting all data from Redis: {e}" - ) - return {} - async def filter_keys(self, keys: set[str]) -> set[str]: async with self._get_redis_connection() as redis: pipe = redis.pipeline() @@ -407,8 +362,24 @@ class RedisKVStorage(BaseKVStorage): # Redis handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + pattern = f"{self.namespace}:{self.workspace}:*" + try: + # Use scan to check if any keys exist + async for key in self.redis.scan_iter(match=pattern, count=1): + return False # Found at least one key + return True # No keys found + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: - """Delete entries with specified IDs""" + """Delete specific records from storage by their IDs""" if not ids: return @@ -868,6 +839,23 @@ class RedisDocStatusStorage(DocStatusStorage): """Redis handles persistence automatically""" pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + pattern = f"{self.final_namespace}:*" + try: + async with self._get_redis_connection() as redis: + # Use scan to check if any keys exist + async for key in redis.scan_iter(match=pattern, count=1): + return False # Found at least one key + return True # No keys found + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + @redis_retry async def upsert(self, data: dict[str, dict[str, Any]]) -> None: """Insert or update document status data""" diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 056b5bca..1f32da50 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -41,10 +41,12 @@ from lightrag.constants import ( DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, + DEFAULT_MAX_SOURCE_IDS_PER_RELATION, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, DEFAULT_EMBEDDING_TIMEOUT, + DEFAULT_SOURCE_IDS_LIMIT_METHOD, ) from lightrag.utils import get_env_value @@ -99,6 +101,9 @@ from lightrag.utils import ( generate_track_id, convert_to_user_format, logger, + subtract_source_ids, + make_relation_chunk_key, + normalize_source_ids_limit_method, ) from lightrag.types import KnowledgeGraph from dotenv import load_dotenv @@ -362,10 +367,32 @@ class LightRAG: """Maximum number of graph nodes to return in knowledge graph queries.""" max_source_ids_per_entity: int = field( - default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + default=get_env_value( + "MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int + ) ) """Maximum number of source (chunk) ids in entity Grpah + VDB.""" + 
max_source_ids_per_relation: int = field( + default=get_env_value( + "MAX_SOURCE_IDS_PER_RELATION", + DEFAULT_MAX_SOURCE_IDS_PER_RELATION, + int, + ) + ) + """Maximum number of source (chunk) ids in relation Graph + VDB.""" + + source_ids_limit_method: str = field( + default_factory=lambda: normalize_source_ids_limit_method( + get_env_value( + "SOURCE_IDS_LIMIT_METHOD", + DEFAULT_SOURCE_IDS_LIMIT_METHOD, + str, + ) + ) + ) + """Strategy for enforcing source_id limits: IGNORE_NEW or FIFO.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( @@ -535,6 +562,18 @@ class LightRAG: embedding_func=self.embedding_func, ) + self.entity_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore + namespace=NameSpace.KV_STORE_ENTITY_CHUNKS, + workspace=self.workspace, + embedding_func=self.embedding_func, + ) + + self.relation_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore + namespace=NameSpace.KV_STORE_RELATION_CHUNKS, + workspace=self.workspace, + embedding_func=self.embedding_func, + ) + self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore namespace=NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION, workspace=self.workspace, @@ -594,6 +633,8 @@ class LightRAG: self.text_chunks, self.full_entities, self.full_relations, + self.entity_chunks, + self.relation_chunks, self.entities_vdb, self.relationships_vdb, self.chunks_vdb, @@ -616,6 +657,8 @@ class LightRAG: ("text_chunks", self.text_chunks), ("full_entities", self.full_entities), ("full_relations", self.full_relations), + ("entity_chunks", self.entity_chunks), + ("relation_chunks", self.relation_chunks), ("entities_vdb", self.entities_vdb), ("relationships_vdb", self.relationships_vdb), ("chunks_vdb", self.chunks_vdb), @@ -671,6 +714,13 @@ class LightRAG: logger.debug("No entities found in graph, skipping migration check") return + try: + # Initialize chunk tracking storage after migration + await self._migrate_chunk_tracking_storage() + except Exception as e: + logger.error(f"Error during chunk_tracking migration: {e}") + raise e + # Check if full_entities and full_relations are empty # Get all processed documents to check their entity/relation data try: @@ -711,11 +761,11 @@ class LightRAG: except Exception as e: logger.error(f"Error during migration check: {e}") - # Don't raise the error, just log it to avoid breaking initialization + raise e except Exception as e: logger.error(f"Error in data migration check: {e}") - # Don't raise the error to avoid breaking initialization + raise e async def _migrate_entity_relation_data(self, processed_docs: dict): """Migrate existing entity and relation data to full_entities and full_relations storage""" @@ -814,6 +864,140 @@ class LightRAG: f"Data migration completed: migrated {migration_count} documents with entities/relations" ) + async def _migrate_chunk_tracking_storage(self) -> None: + """Ensure entity/relation chunk tracking KV stores exist and are seeded.""" + + if not self.entity_chunks or not self.relation_chunks: + return + + need_entity_migration = False + need_relation_migration = False + + try: + need_entity_migration = await self.entity_chunks.is_empty() + except Exception as exc: # pragma: no cover - defensive logging + logger.error(f"Failed to check entity chunks storage: {exc}") + need_entity_migration = True + + try: + need_relation_migration = await self.relation_chunks.is_empty() + except Exception as exc: # pragma: no cover - defensive logging + 
logger.error(f"Failed to check relation chunks storage: {exc}") + need_relation_migration = True + + if not need_entity_migration and not need_relation_migration: + return + + BATCH_SIZE = 500 # Process 500 records per batch + + if need_entity_migration: + try: + nodes = await self.chunk_entity_relation_graph.get_all_nodes() + except Exception as exc: + logger.error(f"Failed to fetch nodes for chunk migration: {exc}") + nodes = [] + + logger.info(f"Starting chunk_tracking data migration: {len(nodes)} nodes") + + # Process nodes in batches + total_nodes = len(nodes) + total_batches = (total_nodes + BATCH_SIZE - 1) // BATCH_SIZE + total_migrated = 0 + + for batch_idx in range(total_batches): + start_idx = batch_idx * BATCH_SIZE + end_idx = min((batch_idx + 1) * BATCH_SIZE, total_nodes) + batch_nodes = nodes[start_idx:end_idx] + + upsert_payload: dict[str, dict[str, object]] = {} + for node in batch_nodes: + entity_id = node.get("entity_id") or node.get("id") + if not entity_id: + continue + + raw_source = node.get("source_id") or "" + chunk_ids = [ + chunk_id + for chunk_id in raw_source.split(GRAPH_FIELD_SEP) + if chunk_id + ] + if not chunk_ids: + continue + + upsert_payload[entity_id] = { + "chunk_ids": chunk_ids, + "count": len(chunk_ids), + } + + if upsert_payload: + await self.entity_chunks.upsert(upsert_payload) + total_migrated += len(upsert_payload) + logger.info( + f"Processed entity batch {batch_idx + 1}/{total_batches}: {len(upsert_payload)} records (total: {total_migrated}/{total_nodes})" + ) + + if total_migrated > 0: + # Persist entity_chunks data to disk + await self.entity_chunks.index_done_callback() + logger.info( + f"Entity chunk_tracking migration completed: {total_migrated} records persisted" + ) + + if need_relation_migration: + try: + edges = await self.chunk_entity_relation_graph.get_all_edges() + except Exception as exc: + logger.error(f"Failed to fetch edges for chunk migration: {exc}") + edges = [] + + logger.info(f"Starting chunk_tracking data migration: {len(edges)} edges") + + # Process edges in batches + total_edges = len(edges) + total_batches = (total_edges + BATCH_SIZE - 1) // BATCH_SIZE + total_migrated = 0 + + for batch_idx in range(total_batches): + start_idx = batch_idx * BATCH_SIZE + end_idx = min((batch_idx + 1) * BATCH_SIZE, total_edges) + batch_edges = edges[start_idx:end_idx] + + upsert_payload: dict[str, dict[str, object]] = {} + for edge in batch_edges: + src = edge.get("source") or edge.get("src_id") or edge.get("src") + tgt = edge.get("target") or edge.get("tgt_id") or edge.get("tgt") + if not src or not tgt: + continue + + raw_source = edge.get("source_id") or "" + chunk_ids = [ + chunk_id + for chunk_id in raw_source.split(GRAPH_FIELD_SEP) + if chunk_id + ] + if not chunk_ids: + continue + + storage_key = make_relation_chunk_key(src, tgt) + upsert_payload[storage_key] = { + "chunk_ids": chunk_ids, + "count": len(chunk_ids), + } + + if upsert_payload: + await self.relation_chunks.upsert(upsert_payload) + total_migrated += len(upsert_payload) + logger.info( + f"Processed relation batch {batch_idx + 1}/{total_batches}: {len(upsert_payload)} records (total: {total_migrated}/{total_edges})" + ) + + if total_migrated > 0: + # Persist relation_chunks data to disk + await self.relation_chunks.index_done_callback() + logger.info( + f"Relation chunk_tracking migration completed: {total_migrated} records persisted" + ) + async def get_graph_labels(self): text = await self.chunk_entity_relation_graph.get_all_labels() return text @@ -1676,6 +1860,8 @@ 
class LightRAG: pipeline_status=pipeline_status, pipeline_status_lock=pipeline_status_lock, llm_response_cache=self.llm_response_cache, + entity_chunks_storage=self.entity_chunks, + relation_chunks_storage=self.relation_chunks, current_file_number=current_file_number, total_files=total_files, file_path=file_path, @@ -1845,6 +2031,8 @@ class LightRAG: self.text_chunks, self.full_entities, self.full_relations, + self.entity_chunks, + self.relation_chunks, self.llm_response_cache, self.entities_vdb, self.relationships_vdb, @@ -2718,9 +2906,11 @@ class LightRAG: # 4. Analyze entities and relationships that will be affected entities_to_delete = set() - entities_to_rebuild = {} # entity_name -> remaining_chunk_ids + entities_to_rebuild = {} # entity_name -> remaining chunk id list relationships_to_delete = set() - relationships_to_rebuild = {} # (src, tgt) -> remaining_chunk_ids + relationships_to_rebuild = {} # (src, tgt) -> remaining chunk id list + entity_chunk_updates: dict[str, list[str]] = {} + relation_chunk_updates: dict[tuple[str, str], list[str]] = {} try: # Get affected entities and relations from full_entities and full_relations storage @@ -2776,14 +2966,41 @@ class LightRAG: # Process entities for node_data in affected_nodes: node_label = node_data.get("entity_id") - if node_label and "source_id" in node_data: - sources = set(node_data["source_id"].split(GRAPH_FIELD_SEP)) - remaining_sources = sources - chunk_ids + if not node_label: + continue - if not remaining_sources: - entities_to_delete.add(node_label) - elif remaining_sources != sources: - entities_to_rebuild[node_label] = remaining_sources + existing_sources: list[str] = [] + if self.entity_chunks: + stored_chunks = await self.entity_chunks.get_by_id(node_label) + if stored_chunks and isinstance(stored_chunks, dict): + existing_sources = [ + chunk_id + for chunk_id in stored_chunks.get("chunk_ids", []) + if chunk_id + ] + + if not existing_sources and node_data.get("source_id"): + existing_sources = [ + chunk_id + for chunk_id in node_data["source_id"].split( + GRAPH_FIELD_SEP + ) + if chunk_id + ] + + if not existing_sources: + continue + + remaining_sources = subtract_source_ids(existing_sources, chunk_ids) + + if not remaining_sources: + entities_to_delete.add(node_label) + entity_chunk_updates[node_label] = [] + elif remaining_sources != existing_sources: + entities_to_rebuild[node_label] = remaining_sources + entity_chunk_updates[node_label] = remaining_sources + else: + logger.info(f"Untouch entity: {node_label}") async with pipeline_status_lock: log_message = f"Found {len(entities_to_rebuild)} affected entities" @@ -2796,21 +3013,51 @@ class LightRAG: src = edge_data.get("source") tgt = edge_data.get("target") - if src and tgt and "source_id" in edge_data: - edge_tuple = tuple(sorted((src, tgt))) - if ( - edge_tuple in relationships_to_delete - or edge_tuple in relationships_to_rebuild - ): - continue + if not src or not tgt or "source_id" not in edge_data: + continue - sources = set(edge_data["source_id"].split(GRAPH_FIELD_SEP)) - remaining_sources = sources - chunk_ids + edge_tuple = tuple(sorted((src, tgt))) + if ( + edge_tuple in relationships_to_delete + or edge_tuple in relationships_to_rebuild + ): + continue - if not remaining_sources: - relationships_to_delete.add(edge_tuple) - elif remaining_sources != sources: - relationships_to_rebuild[edge_tuple] = remaining_sources + existing_sources: list[str] = [] + if self.relation_chunks: + storage_key = make_relation_chunk_key(src, tgt) + stored_chunks = await 
self.relation_chunks.get_by_id( + storage_key + ) + if stored_chunks and isinstance(stored_chunks, dict): + existing_sources = [ + chunk_id + for chunk_id in stored_chunks.get("chunk_ids", []) + if chunk_id + ] + + if not existing_sources: + existing_sources = [ + chunk_id + for chunk_id in edge_data["source_id"].split( + GRAPH_FIELD_SEP + ) + if chunk_id + ] + + if not existing_sources: + continue + + remaining_sources = subtract_source_ids(existing_sources, chunk_ids) + + if not remaining_sources: + relationships_to_delete.add(edge_tuple) + relation_chunk_updates[edge_tuple] = [] + elif remaining_sources != existing_sources: + relationships_to_rebuild[edge_tuple] = remaining_sources + relation_chunk_updates[edge_tuple] = remaining_sources + else: + logger.info(f"Untouch relation: {edge_tuple}") async with pipeline_status_lock: log_message = ( @@ -2820,6 +3067,45 @@ class LightRAG: pipeline_status["latest_message"] = log_message pipeline_status["history_messages"].append(log_message) + current_time = int(time.time()) + + if entity_chunk_updates and self.entity_chunks: + entity_upsert_payload = {} + entity_delete_ids: set[str] = set() + for entity_name, remaining in entity_chunk_updates.items(): + if not remaining: + entity_delete_ids.add(entity_name) + else: + entity_upsert_payload[entity_name] = { + "chunk_ids": remaining, + "count": len(remaining), + "updated_at": current_time, + } + + if entity_delete_ids: + await self.entity_chunks.delete(list(entity_delete_ids)) + if entity_upsert_payload: + await self.entity_chunks.upsert(entity_upsert_payload) + + if relation_chunk_updates and self.relation_chunks: + relation_upsert_payload = {} + relation_delete_ids: set[str] = set() + for edge_tuple, remaining in relation_chunk_updates.items(): + storage_key = make_relation_chunk_key(*edge_tuple) + if not remaining: + relation_delete_ids.add(storage_key) + else: + relation_upsert_payload[storage_key] = { + "chunk_ids": remaining, + "count": len(remaining), + "updated_at": current_time, + } + + if relation_delete_ids: + await self.relation_chunks.delete(list(relation_delete_ids)) + if relation_upsert_payload: + await self.relation_chunks.upsert(relation_upsert_payload) + except Exception as e: logger.error(f"Failed to process graph analysis results: {e}") raise Exception(f"Failed to process graph dependencies: {e}") from e @@ -2914,6 +3200,8 @@ class LightRAG: global_config=asdict(self), pipeline_status=pipeline_status, pipeline_status_lock=pipeline_status_lock, + entity_chunks_storage=self.entity_chunks, + relation_chunks_storage=self.relation_chunks, ) except Exception as e: diff --git a/lightrag/namespace.py b/lightrag/namespace.py index 2acfe9a4..eccd168d 100644 --- a/lightrag/namespace.py +++ b/lightrag/namespace.py @@ -10,6 +10,8 @@ class NameSpace: KV_STORE_LLM_RESPONSE_CACHE = "llm_response_cache" KV_STORE_FULL_ENTITIES = "full_entities" KV_STORE_FULL_RELATIONS = "full_relations" + KV_STORE_ENTITY_CHUNKS = "entity_chunks" + KV_STORE_RELATION_CHUNKS = "relation_chunks" VECTOR_STORE_ENTITIES = "entities" VECTOR_STORE_RELATIONSHIPS = "relationships" diff --git a/lightrag/operate.py b/lightrag/operate.py index c27f417d..2f7f6340 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -7,7 +7,7 @@ import json_repair from typing import Any, AsyncIterator, overload, Literal from collections import Counter, defaultdict -from .utils import ( +from lightrag.utils import ( logger, compute_mdhash_id, Tokenizer, @@ -27,14 +27,16 @@ from .utils import ( pick_by_vector_similarity, 
process_chunks_unified, build_file_path, - truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, convert_to_user_format, generate_reference_list_from_chunks, + apply_source_ids_limit, + merge_source_ids, + make_relation_chunk_key, ) -from .base import ( +from lightrag.base import ( BaseGraphStorage, BaseKVStorage, BaseVectorStorage, @@ -43,8 +45,8 @@ from .base import ( QueryResult, QueryContextResult, ) -from .prompt import PROMPTS -from .constants import ( +from lightrag.prompt import PROMPTS +from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_ENTITY_TOKENS, DEFAULT_MAX_RELATION_TOKENS, @@ -53,8 +55,9 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + SOURCE_IDS_LIMIT_METHOD_KEEP, ) -from .kg.shared_storage import get_storage_keyed_lock +from lightrag.kg.shared_storage import get_storage_keyed_lock import time from dotenv import load_dotenv @@ -474,8 +477,8 @@ async def _handle_single_relationship_extraction( async def _rebuild_knowledge_from_chunks( - entities_to_rebuild: dict[str, set[str]], - relationships_to_rebuild: dict[tuple[str, str], set[str]], + entities_to_rebuild: dict[str, list[str]], + relationships_to_rebuild: dict[tuple[str, str], list[str]], knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, relationships_vdb: BaseVectorStorage, @@ -484,6 +487,8 @@ async def _rebuild_knowledge_from_chunks( global_config: dict[str, str], pipeline_status: dict | None = None, pipeline_status_lock=None, + entity_chunks_storage: BaseKVStorage | None = None, + relation_chunks_storage: BaseKVStorage | None = None, ) -> None: """Rebuild entity and relationship descriptions from cached extraction results with parallel processing @@ -492,8 +497,8 @@ async def _rebuild_knowledge_from_chunks( controlled by llm_model_max_async and using get_storage_keyed_lock for data consistency. 
Args: - entities_to_rebuild: Dict mapping entity_name -> set of remaining chunk_ids - relationships_to_rebuild: Dict mapping (src, tgt) -> set of remaining chunk_ids + entities_to_rebuild: Dict mapping entity_name -> list of remaining chunk_ids + relationships_to_rebuild: Dict mapping (src, tgt) -> list of remaining chunk_ids knowledge_graph_inst: Knowledge graph storage entities_vdb: Entity vector database relationships_vdb: Relationship vector database @@ -502,6 +507,8 @@ async def _rebuild_knowledge_from_chunks( global_config: Global configuration containing llm_model_max_async pipeline_status: Pipeline status dictionary pipeline_status_lock: Lock for pipeline status + entity_chunks_storage: KV storage maintaining full chunk IDs per entity + relation_chunks_storage: KV storage maintaining full chunk IDs per relation """ if not entities_to_rebuild and not relationships_to_rebuild: return @@ -641,10 +648,11 @@ async def _rebuild_knowledge_from_chunks( chunk_entities=chunk_entities, llm_response_cache=llm_response_cache, global_config=global_config, + entity_chunks_storage=entity_chunks_storage, ) rebuilt_entities_count += 1 status_message = ( - f"Rebuilt `{entity_name}` from {len(chunk_ids)} chunks" + f"Rebuild `{entity_name}` from {len(chunk_ids)} chunks" ) logger.info(status_message) if pipeline_status is not None and pipeline_status_lock is not None: @@ -682,16 +690,11 @@ async def _rebuild_knowledge_from_chunks( chunk_relationships=chunk_relationships, llm_response_cache=llm_response_cache, global_config=global_config, + relation_chunks_storage=relation_chunks_storage, + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, ) rebuilt_relationships_count += 1 - status_message = ( - f"Rebuilt `{src} - {tgt}` from {len(chunk_ids)} chunks" - ) - logger.info(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) except Exception as e: failed_relationships_count += 1 status_message = f"Failed to rebuild `{src} - {tgt}`: {e}" @@ -1002,10 +1005,13 @@ async def _rebuild_single_entity( knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, entity_name: str, - chunk_ids: set[str], + chunk_ids: list[str], chunk_entities: dict, llm_response_cache: BaseKVStorage, global_config: dict[str, str], + entity_chunks_storage: BaseKVStorage | None = None, + pipeline_status: dict | None = None, + pipeline_status_lock=None, ) -> None: """Rebuild a single entity from cached extraction results""" @@ -1016,7 +1022,11 @@ async def _rebuild_single_entity( # Helper function to update entity in both graph and vector storage async def _update_entity_storage( - final_description: str, entity_type: str, file_paths: set[str] + final_description: str, + entity_type: str, + file_paths: set[str], + source_chunk_ids: list[str], + truncation_info: str = "", ): try: # Update entity in graph storage (critical path) @@ -1024,10 +1034,12 @@ async def _rebuild_single_entity( **current_entity, "description": final_description, "entity_type": entity_type, - "source_id": GRAPH_FIELD_SEP.join(chunk_ids), + "source_id": GRAPH_FIELD_SEP.join(source_chunk_ids), "file_path": GRAPH_FIELD_SEP.join(file_paths) if file_paths else current_entity.get("file_path", "unknown_source"), + "created_at": int(time.time()), + "truncate": truncation_info, } await knowledge_graph_inst.upsert_node(entity_name, updated_entity_data) @@ 
-1060,9 +1072,33 @@ async def _rebuild_single_entity( logger.error(error_msg) raise # Re-raise exception - # Collect all entity data from relevant chunks + # normalized_chunk_ids = merge_source_ids([], chunk_ids) + normalized_chunk_ids = chunk_ids + + if entity_chunks_storage is not None and normalized_chunk_ids: + await entity_chunks_storage.upsert( + { + entity_name: { + "chunk_ids": normalized_chunk_ids, + "count": len(normalized_chunk_ids), + } + } + ) + + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + + limited_chunk_ids = apply_source_ids_limit( + normalized_chunk_ids, + global_config["max_source_ids_per_entity"], + limit_method, + identifier=f"`{entity_name}`", + ) + + # Collect all entity data from relevant (limited) chunks all_entity_data = [] - for chunk_id in chunk_ids: + for chunk_id in limited_chunk_ids: if chunk_id in chunk_entities and entity_name in chunk_entities[chunk_id]: all_entity_data.extend(chunk_entities[chunk_id][entity_name]) @@ -1109,7 +1145,12 @@ async def _rebuild_single_entity( final_description = current_entity.get("description", "") entity_type = current_entity.get("entity_type", "UNKNOWN") - await _update_entity_storage(final_description, entity_type, file_paths) + await _update_entity_storage( + final_description, + entity_type, + file_paths, + limited_chunk_ids, + ) return # Process cached entity data @@ -1149,7 +1190,31 @@ async def _rebuild_single_entity( else: final_description = current_entity.get("description", "") - await _update_entity_storage(final_description, entity_type, file_paths) + if len(limited_chunk_ids) < len(normalized_chunk_ids): + truncation_info = ( + f"{limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" + ) + else: + truncation_info = "" + + await _update_entity_storage( + final_description, + entity_type, + file_paths, + limited_chunk_ids, + truncation_info, + ) + + # Log rebuild completion with truncation info + status_message = f"Rebuild `{entity_name}` from {len(chunk_ids)} chunks" + if truncation_info: + status_message += f" ({truncation_info})" + logger.info(status_message) + # Update pipeline status + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) async def _rebuild_single_relationship( @@ -1157,10 +1222,13 @@ async def _rebuild_single_relationship( relationships_vdb: BaseVectorStorage, src: str, tgt: str, - chunk_ids: set[str], + chunk_ids: list[str], chunk_relationships: dict, llm_response_cache: BaseKVStorage, global_config: dict[str, str], + relation_chunks_storage: BaseKVStorage | None = None, + pipeline_status: dict | None = None, + pipeline_status_lock=None, ) -> None: """Rebuild a single relationship from cached extraction results @@ -1173,9 +1241,33 @@ async def _rebuild_single_relationship( if not current_relationship: return + # normalized_chunk_ids = merge_source_ids([], chunk_ids) + normalized_chunk_ids = chunk_ids + + if relation_chunks_storage is not None and normalized_chunk_ids: + storage_key = make_relation_chunk_key(src, tgt) + await relation_chunks_storage.upsert( + { + storage_key: { + "chunk_ids": normalized_chunk_ids, + "count": len(normalized_chunk_ids), + } + } + ) + + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + limited_chunk_ids = apply_source_ids_limit( + normalized_chunk_ids, + 
global_config["max_source_ids_per_relation"], + limit_method, + identifier=f"`{src}`~`{tgt}`", + ) + # Collect all relationship data from relevant chunks all_relationship_data = [] - for chunk_id in chunk_ids: + for chunk_id in limited_chunk_ids: if chunk_id in chunk_relationships: # Check both (src, tgt) and (tgt, src) since relationships can be bidirectional for edge_key in [(src, tgt), (tgt, src)]: @@ -1230,6 +1322,13 @@ async def _rebuild_single_relationship( # fallback to keep current(unchanged) final_description = current_relationship.get("description", "") + if len(limited_chunk_ids) < len(normalized_chunk_ids): + truncation_info = ( + f"{limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" + ) + else: + truncation_info = "" + # Update relationship in graph storage updated_relationship_data = { **current_relationship, @@ -1238,10 +1337,11 @@ async def _rebuild_single_relationship( else current_relationship.get("description", ""), "keywords": combined_keywords, "weight": weight, - "source_id": GRAPH_FIELD_SEP.join(chunk_ids), + "source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids), "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp]) if file_paths else current_relationship.get("file_path", "unknown_source"), + "truncate": truncation_info, } await knowledge_graph_inst.upsert_edge(src, tgt, updated_relationship_data) @@ -1287,6 +1387,25 @@ async def _rebuild_single_relationship( logger.error(error_msg) raise # Re-raise exception + # Log rebuild completion with truncation info + status_message = f"Rebuild `{src} - {tgt}` from {len(chunk_ids)} chunks" + if truncation_info: + status_message += f" ({truncation_info})" + # Add truncation info from apply_source_ids_limit if truncation occurred + if len(limited_chunk_ids) < len(normalized_chunk_ids): + truncation_info = ( + f" ({limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)})" + ) + status_message += truncation_info + + logger.info(status_message) + + # Update pipeline status + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + async def _merge_nodes_then_upsert( entity_name: str, @@ -1296,6 +1415,7 @@ async def _merge_nodes_then_upsert( pipeline_status: dict = None, pipeline_status_lock=None, llm_response_cache: BaseKVStorage | None = None, + entity_chunks_storage: BaseKVStorage | None = None, ): """Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.""" already_entity_types = [] @@ -1318,10 +1438,78 @@ async def _merge_nodes_then_upsert( reverse=True, )[0][0] # Get the entity type with the highest count + original_nodes_count = len(nodes_data) + + new_source_ids = [dp["source_id"] for dp in nodes_data if dp.get("source_id")] + + existing_full_source_ids = [] + if entity_chunks_storage is not None: + stored_chunks = await entity_chunks_storage.get_by_id(entity_name) + if stored_chunks and isinstance(stored_chunks, dict): + existing_full_source_ids = [ + chunk_id for chunk_id in stored_chunks.get("chunk_ids", []) if chunk_id + ] + + if not existing_full_source_ids: + existing_full_source_ids = [ + chunk_id for chunk_id in already_source_ids if chunk_id + ] + + full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) + + if entity_chunks_storage is not None and full_source_ids: + await entity_chunks_storage.upsert( + { + entity_name: { + "chunk_ids": full_source_ids, 
+ "count": len(full_source_ids), + } + } + ) + + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + source_ids = apply_source_ids_limit( + full_source_ids, + global_config["max_source_ids_per_entity"], + limit_method, + identifier=f"`{entity_name}`", + ) + + # Only apply filtering in IGNORE_NEW mode + if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: + allowed_source_ids = set(source_ids) + filtered_nodes = [] + for dp in nodes_data: + source_id = dp.get("source_id") + # Skip descriptions sourced from chunks dropped by the IGNORE_NEW cap + if ( + source_id + and source_id not in allowed_source_ids + and source_id not in existing_full_source_ids + ): + continue + filtered_nodes.append(dp) + nodes_data = filtered_nodes + else: + # In FIFO mode, keep all node descriptions - truncation happens at source_ids level only + nodes_data = list(nodes_data) + + max_source_limit = global_config["max_source_ids_per_entity"] + skip_summary_due_to_limit = ( + limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP + and len(existing_full_source_ids) >= max_source_limit + and not nodes_data + and already_description + ) + # Deduplicate by description, keeping first occurrence unique_nodes = {} for dp in nodes_data: - desc = dp["description"] + desc = dp.get("description") + if not desc: + continue if desc not in unique_nodes: unique_nodes[desc] = dp @@ -1332,17 +1520,31 @@ async def _merge_nodes_then_upsert( ) sorted_descriptions = [dp["description"] for dp in sorted_nodes] + truncation_info = "" + dd_message = "" + # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions + deduplicated_num = original_nodes_count - len(sorted_descriptions) + if deduplicated_num > 0: + dd_message = f"dd:{deduplicated_num}" num_fragment = len(description_list) already_fragment = len(already_description) - deduplicated_num = already_fragment + len(nodes_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"(dd:{deduplicated_num})" - else: - dd_message = "" - if num_fragment > 0: + if skip_summary_due_to_limit: + description = ( + already_node.get("description", "(no description)") + if already_node + else "(no description)" + ) + llm_was_used = False + status_message = f"Skip merge for `{entity_name}`: IGNORE_NEW limit reached" + logger.debug(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( "Entity", @@ -1355,9 +1557,16 @@ async def _merge_nodes_then_upsert( # Log based on actual LLM usage if llm_was_used: - status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" else: - status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" + + # Add truncation info from apply_source_ids_limit if truncation occurred + if len(source_ids) < len(full_source_ids): + truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + + if dd_message or truncation_info: + status_message += 
f"({','.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1372,9 +1581,6 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - - source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1386,6 +1592,7 @@ async def _merge_nodes_then_upsert( source_id=source_id, file_path=file_path, created_at=int(time.time()), + truncate=truncation_info, ) await knowledge_graph_inst.upsert_node( entity_name, @@ -1405,6 +1612,7 @@ async def _merge_edges_then_upsert( pipeline_status_lock=None, llm_response_cache: BaseKVStorage | None = None, added_entities: list = None, # New parameter to track entities added during edge processing + relation_chunks_storage: BaseKVStorage | None = None, ): if src_id == tgt_id: return None @@ -1448,16 +1656,84 @@ async def _merge_edges_then_upsert( ) ) + original_edges_count = len(edges_data) + + new_source_ids = [dp["source_id"] for dp in edges_data if dp.get("source_id")] + + storage_key = make_relation_chunk_key(src_id, tgt_id) + existing_full_source_ids = [] + if relation_chunks_storage is not None: + stored_chunks = await relation_chunks_storage.get_by_id(storage_key) + if stored_chunks and isinstance(stored_chunks, dict): + existing_full_source_ids = [ + chunk_id for chunk_id in stored_chunks.get("chunk_ids", []) if chunk_id + ] + + if not existing_full_source_ids: + existing_full_source_ids = [ + chunk_id for chunk_id in already_source_ids if chunk_id + ] + + full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) + + if relation_chunks_storage is not None and full_source_ids: + await relation_chunks_storage.upsert( + { + storage_key: { + "chunk_ids": full_source_ids, + "count": len(full_source_ids), + } + } + ) + + source_ids = apply_source_ids_limit( + full_source_ids, + global_config["max_source_ids_per_relation"], + global_config.get("source_ids_limit_method"), + identifier=f"`{src_id}`~`{tgt_id}`", + ) + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + + # Only apply filtering in IGNORE_NEW mode + if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: + allowed_source_ids = set(source_ids) + filtered_edges = [] + for dp in edges_data: + source_id = dp.get("source_id") + # Skip relationship fragments sourced from chunks dropped by the IGNORE_NEW cap + if ( + source_id + and source_id not in allowed_source_ids + and source_id not in existing_full_source_ids + ): + continue + filtered_edges.append(dp) + edges_data = filtered_edges + else: + # In FIFO mode, keep all edge descriptions - truncation happens at source_ids level only + edges_data = list(edges_data) + + max_source_limit = global_config["max_source_ids_per_relation"] + skip_summary_due_to_limit = ( + limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP + and len(existing_full_source_ids) >= max_source_limit + and not edges_data + and already_description + ) + # Process edges_data with None checks weight = sum([dp["weight"] for dp in edges_data] + already_weights) # Deduplicate by description, keeping first occurrence unique_edges = {} for dp in edges_data: - if dp.get("description"): - desc = dp["description"] - if desc not in unique_edges: - unique_edges[desc] = dp + 
description_value = dp.get("description") + if not description_value: + continue + if description_value not in unique_edges: + unique_edges[description_value] = dp # Sort description by timestamp, then by description length (largest to smallest) when timestamps are the same sorted_edges = sorted( @@ -1466,17 +1742,34 @@ async def _merge_edges_then_upsert( ) sorted_descriptions = [dp["description"] for dp in sorted_edges] + truncation_info = "" + dd_message = "" + # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions + deduplicated_num = original_edges_count - len(sorted_descriptions) + if deduplicated_num > 0: + dd_message = f"dd:{deduplicated_num}" num_fragment = len(description_list) already_fragment = len(already_description) - deduplicated_num = already_fragment + len(edges_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"(dd:{deduplicated_num})" - else: - dd_message = "" - if num_fragment > 0: + + if skip_summary_due_to_limit: + description = ( + already_edge.get("description", "(no description)") + if already_edge + else "(no description)" + ) + llm_was_used = False + status_message = ( + f"Skip merge for `{src_id}`~`{tgt_id}`: IGNORE_NEW limit reached" + ) + logger.debug(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( "Relation", @@ -1489,9 +1782,16 @@ async def _merge_edges_then_upsert( # Log based on actual LLM usage if llm_was_used: - status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" else: - status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" + + # Add truncation info from apply_source_ids_limit if truncation occurred + if len(source_ids) < len(full_source_ids): + truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + + if dd_message or truncation_info: + status_message += f"({','.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1521,12 +1821,7 @@ async def _merge_edges_then_upsert( # Join all unique keywords with commas keywords = ",".join(sorted(all_keywords)) - source_id = GRAPH_FIELD_SEP.join( - set( - [dp["source_id"] for dp in edges_data if dp.get("source_id")] - + already_source_ids - ) - ) + source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, edges_data, f"{src_id}-{tgt_id}") for need_insert_id in [src_id, tgt_id]: @@ -1538,6 +1833,7 @@ async def _merge_edges_then_upsert( "entity_type": "UNKNOWN", "file_path": file_path, "created_at": int(time.time()), + "truncate": "", } await knowledge_graph_inst.upsert_node(need_insert_id, node_data=node_data) @@ -1563,6 +1859,7 @@ async def _merge_edges_then_upsert( source_id=source_id, file_path=file_path, created_at=int(time.time()), + truncate=truncation_info, ), ) @@ -1574,6 +1871,7 @@ async def _merge_edges_then_upsert( source_id=source_id, file_path=file_path, 
created_at=int(time.time()), + truncate=truncation_info, ) return edge_data @@ -1591,6 +1889,8 @@ async def merge_nodes_and_edges( pipeline_status: dict = None, pipeline_status_lock=None, llm_response_cache: BaseKVStorage | None = None, + entity_chunks_storage: BaseKVStorage | None = None, + relation_chunks_storage: BaseKVStorage | None = None, current_file_number: int = 0, total_files: int = 0, file_path: str = "unknown_source", @@ -1614,6 +1914,8 @@ async def merge_nodes_and_edges( pipeline_status: Pipeline status dictionary pipeline_status_lock: Lock for pipeline status llm_response_cache: LLM response cache + entity_chunks_storage: Storage tracking full chunk lists per entity + relation_chunks_storage: Storage tracking full chunk lists per relation current_file_number: Current file number for logging total_files: Total files for logging file_path: File path for logging @@ -1671,6 +1973,7 @@ async def merge_nodes_and_edges( pipeline_status, pipeline_status_lock, llm_response_cache, + entity_chunks_storage, ) # Vector database operation (equally critical, must succeed) @@ -1689,7 +1992,6 @@ async def merge_nodes_and_edges( } } - logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( @@ -1804,6 +2106,7 @@ async def merge_nodes_and_edges( pipeline_status_lock, llm_response_cache, added_entities, # Pass list to collect added entities + relation_chunks_storage, ) if edge_data is None: diff --git a/lightrag/utils.py b/lightrag/utils.py index 959607e5..6805227e 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -15,7 +15,17 @@ from dataclasses import dataclass from datetime import datetime from functools import wraps from hashlib import md5 -from typing import Any, Protocol, Callable, TYPE_CHECKING, List, Optional +from typing import ( + Any, + Protocol, + Callable, + TYPE_CHECKING, + List, + Optional, + Iterable, + Sequence, + Collection, +) import numpy as np from dotenv import load_dotenv @@ -26,6 +36,9 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_SOURCE_IDS_LIMIT_METHOD, + VALID_SOURCE_IDS_LIMIT_METHODS, + SOURCE_IDS_LIMIT_METHOD_FIFO, ) # Initialize logger with basic configuration @@ -2464,24 +2477,112 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: - """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" - already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] +def normalize_source_ids_limit_method(method: str | None) -> str: + """Normalize the source ID limiting strategy and fall back to default when invalid.""" - if already_len <= max_chunk_ids_per_entity: - return chunk_ids + if not method: + return DEFAULT_SOURCE_IDS_LIMIT_METHOD - logger.warning( - f"Source Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len}, truncating..." 
- ) + normalized = method.upper() + if normalized not in VALID_SOURCE_IDS_LIMIT_METHODS: + logger.warning( + "Unknown SOURCE_IDS_LIMIT_METHOD '%s', falling back to %s", + method, + DEFAULT_SOURCE_IDS_LIMIT_METHOD, + ) + return DEFAULT_SOURCE_IDS_LIMIT_METHOD - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + return normalized +def merge_source_ids( + existing_ids: Iterable[str] | None, new_ids: Iterable[str] | None +) -> list[str]: + """Merge two iterables of source IDs while preserving order and removing duplicates.""" + + merged: list[str] = [] + seen: set[str] = set() + + for sequence in (existing_ids, new_ids): + if not sequence: + continue + for source_id in sequence: + if not source_id: + continue + if source_id not in seen: + seen.add(source_id) + merged.append(source_id) + + return merged + + +def apply_source_ids_limit( + source_ids: Sequence[str], + limit: int, + method: str, + *, + identifier: str | None = None, +) -> list[str]: + """Apply a limit strategy to a sequence of source IDs.""" + + if limit <= 0: + return [] + + source_ids_list = list(source_ids) + if len(source_ids_list) <= limit: + return source_ids_list + + normalized_method = normalize_source_ids_limit_method(method) + + if normalized_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + truncated = source_ids_list[-limit:] + else: # IGNORE_NEW + truncated = source_ids_list[:limit] + + if identifier and len(truncated) < len(source_ids_list): + logger.debug( + "Source_id truncated: %s | %s keeping %s of %s entries", + identifier, + normalized_method, + len(truncated), + len(source_ids_list), + ) + + return truncated + + +def subtract_source_ids( + source_ids: Iterable[str], + ids_to_remove: Collection[str], +) -> list[str]: + """Remove a collection of IDs from an ordered iterable while preserving order.""" + + removal_set = set(ids_to_remove) + if not removal_set: + return [source_id for source_id in source_ids if source_id] + + return [ + source_id + for source_id in source_ids + if source_id and source_id not in removal_set + ] + + +def make_relation_chunk_key(src: str, tgt: str) -> str: + """Create a deterministic storage key for relation chunk tracking.""" + + return GRAPH_FIELD_SEP.join(sorted((src, tgt))) + + +def parse_relation_chunk_key(key: str) -> tuple[str, str]: + """Parse a relation chunk storage key back into its entity pair.""" + + parts = key.split(GRAPH_FIELD_SEP) + if len(parts) != 2: + raise ValueError(f"Invalid relation chunk key: {key}") + return parts[0], parts[1] + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From a9fec26798042c44e98f3700d84aa81f4acd90b3 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 20:12:53 +0800 Subject: [PATCH 08/25] Add file path limit configuration for entities and relations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add MAX_FILE_PATHS env variable • Implement file path count limiting • Support KEEP/FIFO strategies • Add truncation placeholder • Remove old build_file_path function --- env.example | 3 + lightrag/constants.py | 28 +++--- lightrag/lightrag.py | 10 ++ lightrag/operate.py | 213 ++++++++++++++++++++++++++++++++++++++---- lightrag/utils.py | 60 ------------ 5 files changed, 224 insertions(+), 90 deletions(-) diff --git a/env.example b/env.example index 6d53c390..3529cf58 100644 --- a/env.example +++ b/env.example @@ -73,11 +73,14 @@ ENABLE_LLM_CACHE=true # 
MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 + ### control the maximum chunk_ids stored in vector and graph db # MAX_SOURCE_IDS_PER_ENTITY=300 # MAX_SOURCE_IDS_PER_RELATION=300 ### control chunk_ids limitation method: KEEP, FIFO (KEPP: Ingore New Chunks, FIFO: New chunks replace old chunks) # SOURCE_IDS_LIMIT_METHOD=KEEP +### Maximum number of file paths stored in entity/relation file_path field +# MAX_FILE_PATHS=30 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index e374a991..62ca1888 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -14,16 +14,6 @@ DEFAULT_MAX_GRAPH_NODES = 1000 DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 -DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 -SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" -SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" -DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP -VALID_SOURCE_IDS_LIMIT_METHODS = { - SOURCE_IDS_LIMIT_METHOD_KEEP, - SOURCE_IDS_LIMIT_METHOD_FIFO, -} - # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 # Max description token size to trigger LLM summary @@ -67,8 +57,24 @@ DEFAULT_HISTORY_TURNS = 0 DEFAULT_MIN_RERANK_SCORE = 0.0 DEFAULT_RERANK_BINDING = "null" -# File path configuration for vector and graph database(Should not be changed, used in Milvus Schema) +# Default source ids limit in meta data for entity and relation +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 +DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 +SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" +SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" +DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP +VALID_SOURCE_IDS_LIMIT_METHODS = { + SOURCE_IDS_LIMIT_METHOD_KEEP, + SOURCE_IDS_LIMIT_METHOD_FIFO, +} +# Default file_path limit in meta data for entity and relation +DEFAULT_MAX_FILE_PATHS = 2 + +# Field length of file_path in Milvus Schema for entity and relation (Should not be changed) +# file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata. 
DEFAULT_MAX_FILE_PATH_LENGTH = 32768 +# Placeholder for more file paths in meta data for entity and relation (Should not be changed) +DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated" # Default temperature for LLM DEFAULT_TEMPERATURE = 1.0 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 1f32da50..4380a276 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -47,6 +47,8 @@ from lightrag.constants import ( DEFAULT_LLM_TIMEOUT, DEFAULT_EMBEDDING_TIMEOUT, DEFAULT_SOURCE_IDS_LIMIT_METHOD, + DEFAULT_MAX_FILE_PATHS, + DEFAULT_FILE_PATH_MORE_PLACEHOLDER, ) from lightrag.utils import get_env_value @@ -393,6 +395,14 @@ class LightRAG: ) """Strategy for enforcing source_id limits: IGNORE_NEW or FIFO.""" + max_file_paths: int = field( + default=get_env_value("MAX_FILE_PATHS", DEFAULT_MAX_FILE_PATHS, int) + ) + """Maximum number of file paths to store in entity/relation file_path field.""" + + file_path_more_placeholder: str = field(default=DEFAULT_FILE_PATH_MORE_PLACEHOLDER) + """Placeholder text when file paths exceed max_file_paths limit.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 2f7f6340..6b409f21 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -26,7 +26,6 @@ from lightrag.utils import ( pick_by_weighted_polling, pick_by_vector_similarity, process_chunks_unified, - build_file_path, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -56,6 +55,8 @@ from lightrag.constants import ( DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, SOURCE_IDS_LIMIT_METHOD_KEEP, + SOURCE_IDS_LIMIT_METHOD_FIFO, + DEFAULT_FILE_PATH_MORE_PLACEHOLDER, ) from lightrag.kg.shared_storage import get_storage_keyed_lock import time @@ -1156,7 +1157,8 @@ async def _rebuild_single_entity( # Process cached entity data descriptions = [] entity_types = [] - file_paths = set() + file_paths_list = [] + seen_paths = set() for entity_data in all_entity_data: if entity_data.get("description"): @@ -1164,7 +1166,35 @@ async def _rebuild_single_entity( if entity_data.get("entity_type"): entity_types.append(entity_data["entity_type"]) if entity_data.get("file_path"): - file_paths.add(entity_data["file_path"]) + file_path = entity_data["file_path"] + if file_path and file_path not in seen_paths: + file_paths_list.append(file_path) + seen_paths.add(file_path) + + # Apply MAX_FILE_PATHS limit + max_file_paths = global_config.get("max_file_paths") + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + limit_method = global_config.get("source_ids_limit_method") + + original_count = len(file_paths_list) + if original_count > max_file_paths: + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." 
+ ) + logger.info( + f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_paths = set(file_paths_list) # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) @@ -1284,7 +1314,8 @@ async def _rebuild_single_relationship( descriptions = [] keywords = [] weights = [] - file_paths = set() + file_paths_list = [] + seen_paths = set() for rel_data in all_relationship_data: if rel_data.get("description"): @@ -1294,7 +1325,35 @@ async def _rebuild_single_relationship( if rel_data.get("weight"): weights.append(rel_data["weight"]) if rel_data.get("file_path"): - file_paths.add(rel_data["file_path"]) + file_path = rel_data["file_path"] + if file_path and file_path not in seen_paths: + file_paths_list.append(file_path) + seen_paths.add(file_path) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + limit_method = global_config.get("source_ids_limit_method") + + original_count = len(file_paths_list) + if original_count > max_file_paths: + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + ) + logger.info( + f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_paths = set(file_paths_list) # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) @@ -1467,23 +1526,22 @@ async def _merge_nodes_then_upsert( } ) - limit_method = ( - global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP - ) + limit_method = global_config.get("source_ids_limit_method") + max_source_limit = global_config.get("max_source_ids_per_entity") source_ids = apply_source_ids_limit( full_source_ids, - global_config["max_source_ids_per_entity"], + max_source_limit, limit_method, identifier=f"`{entity_name}`", ) - # Only apply filtering in IGNORE_NEW mode + # Only apply filtering in KEEP(ignore new) mode if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_nodes = [] for dp in nodes_data: source_id = dp.get("source_id") - # Skip descriptions sourced from chunks dropped by the IGNORE_NEW cap + # Skip descriptions sourced from chunks dropped by the limitation cap if ( source_id and source_id not in allowed_source_ids @@ -1496,7 +1554,6 @@ async def _merge_nodes_then_upsert( # In FIFO mode, keep all node descriptions - truncation happens at source_ids level only nodes_data = list(nodes_data) - max_source_limit = global_config["max_source_ids_per_entity"] skip_summary_due_to_limit = ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit @@ -1566,7 +1623,7 @@ async def _merge_nodes_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f"({','.join([truncation_info, dd_message])})" + status_message += f" ({', '.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1583,7 +1640,65 @@ async def _merge_nodes_then_upsert( source_id = GRAPH_FIELD_SEP.join(source_ids) - 
file_path = build_file_path(already_file_paths, nodes_data, entity_name) + # Build file_path with count limit + if skip_summary_due_to_limit: + # Skip limit, keep original file_path + file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) + else: + # Collect and apply limit + file_paths_list = [] + seen_paths = set() + + # Get placeholder to filter it out + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + # Skip placeholders (format: "...{placeholder}(showing X of Y)...") + if ( + fp + and not fp.startswith(f"...{file_path_placeholder}") + and fp not in seen_paths + ): + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in nodes_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP + ) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + original_count = len(file_paths_list) + + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + ) + logger.info( + f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_path = GRAPH_FIELD_SEP.join(file_paths_list) node_data = dict( entity_id=entity_name, @@ -1686,10 +1801,12 @@ async def _merge_edges_then_upsert( } ) + limit_method = global_config.get("source_ids_limit_method") + max_source_limit = global_config.get("max_source_ids_per_relation") source_ids = apply_source_ids_limit( full_source_ids, - global_config["max_source_ids_per_relation"], - global_config.get("source_ids_limit_method"), + max_source_limit, + limit_method, identifier=f"`{src_id}`~`{tgt_id}`", ) limit_method = ( @@ -1715,7 +1832,6 @@ async def _merge_edges_then_upsert( # In FIFO mode, keep all edge descriptions - truncation happens at source_ids level only edges_data = list(edges_data) - max_source_limit = global_config["max_source_ids_per_relation"] skip_summary_due_to_limit = ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit @@ -1791,7 +1907,7 @@ async def _merge_edges_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f"({','.join([truncation_info, dd_message])})" + status_message += f" ({', '.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1822,7 +1938,66 @@ async def _merge_edges_then_upsert( keywords = ",".join(sorted(all_keywords)) source_id = GRAPH_FIELD_SEP.join(source_ids) - file_path = build_file_path(already_file_paths, edges_data, f"{src_id}-{tgt_id}") + + # Build file_path with count limit + if skip_summary_due_to_limit: + # Skip limit, keep original file_path + file_path = GRAPH_FIELD_SEP.join(fp for fp in 
already_file_paths if fp) + else: + # Collect and apply limit + file_paths_list = [] + seen_paths = set() + + # Get placeholder to filter it out + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + # Skip placeholders (format: "...{placeholder}(showing X of Y)...") + if ( + fp + and not fp.startswith(f"...{file_path_placeholder}") + and fp not in seen_paths + ): + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in edges_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP + ) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + original_count = len(file_paths_list) + + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + ) + logger.info( + f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_path = GRAPH_FIELD_SEP.join(file_paths_list) for need_insert_id in [src_id, tgt_id]: if not (await knowledge_graph_inst.has_node(need_insert_id)): diff --git a/lightrag/utils.py b/lightrag/utils.py index 6805227e..bfa3cac4 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -35,7 +35,6 @@ from lightrag.constants import ( DEFAULT_LOG_FILENAME, GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, - DEFAULT_MAX_FILE_PATH_LENGTH, DEFAULT_SOURCE_IDS_LIMIT_METHOD, VALID_SOURCE_IDS_LIMIT_METHODS, SOURCE_IDS_LIMIT_METHOD_FIFO, @@ -2584,65 +2583,6 @@ def parse_relation_chunk_key(key: str) -> tuple[str, str]: return parts[0], parts[1] -def build_file_path(already_file_paths, data_list, target): - """Build file path string with UTF-8 byte length limit and deduplication - - Args: - already_file_paths: List of existing file paths - data_list: List of data items containing file_path - target: Target name for logging warnings - - Returns: - str: Combined file paths separated by GRAPH_FIELD_SEP - """ - # set: deduplication - file_paths_set = {fp for fp in already_file_paths if fp} - - # string: filter empty value and keep file order in already_file_paths - file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) - - # Check if initial file_paths already exceeds byte length limit - if len(file_paths.encode("utf-8")) >= DEFAULT_MAX_FILE_PATH_LENGTH: - logger.warning( - f"Initial file_paths already exceeds {DEFAULT_MAX_FILE_PATH_LENGTH} bytes for {target}, " - f"current size: {len(file_paths.encode('utf-8'))} bytes" - ) - - # ignored file_paths - file_paths_ignore = "" - # add file_paths - for dp in data_list: - cur_file_path = dp.get("file_path") - # empty - if not cur_file_path: - continue - - # skip duplicate item - if cur_file_path in file_paths_set: - continue - # add - file_paths_set.add(cur_file_path) - - # check the UTF-8 byte length - new_addition 
= GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path - if ( - len(file_paths.encode("utf-8")) + len(new_addition.encode("utf-8")) - < DEFAULT_MAX_FILE_PATH_LENGTH - 5 - ): - # append - file_paths += new_addition - else: - # ignore - file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path - - if file_paths_ignore: - logger.warning( - f"File paths exceed {DEFAULT_MAX_FILE_PATH_LENGTH} bytes for {target}, " - f"ignoring file path: {file_paths_ignore}" - ) - return file_paths - - def generate_track_id(prefix: str = "upload") -> str: """Generate a unique tracking ID with timestamp and UUID From e0fd31a60d5e346b4cd9566d114789fab915fcbd Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 22:09:09 +0800 Subject: [PATCH 09/25] Fix logging message formatting --- env.example | 26 +++++++++++++------------- lightrag/constants.py | 6 +++--- lightrag/operate.py | 16 ++++++++++------ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/env.example b/env.example index 3529cf58..73f2d7b7 100644 --- a/env.example +++ b/env.example @@ -74,19 +74,6 @@ ENABLE_LLM_CACHE=true ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector and graph db -# MAX_SOURCE_IDS_PER_ENTITY=300 -# MAX_SOURCE_IDS_PER_RELATION=300 -### control chunk_ids limitation method: KEEP, FIFO (KEPP: Ingore New Chunks, FIFO: New chunks replace old chunks) -# SOURCE_IDS_LIMIT_METHOD=KEEP -### Maximum number of file paths stored in entity/relation file_path field -# MAX_FILE_PATHS=30 - -### maximum number of related chunks per source entity or relation -### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) -### Higher values increase re-ranking time -# RELATED_CHUNK_NUMBER=5 - ### chunk selection strategies ### VECTOR: Pick KG chunks by vector similarity, delivered chunks to the LLM aligning more closely with naive retrieval ### WEIGHT: Pick KG chunks by entity and chunk weight, delivered more solely KG related chunks to the LLM @@ -148,6 +135,19 @@ SUMMARY_LANGUAGE=English ### Maximum context size sent to LLM for description summary # SUMMARY_CONTEXT_SIZE=12000 +### control the maximum chunk_ids stored in vector and graph db +# MAX_SOURCE_IDS_PER_ENTITY=300 +# MAX_SOURCE_IDS_PER_RELATION=300 +### control chunk_ids limitation method: KEEP, FIFO (KEEP: Keep oldest, FIFO: First in first out) +# SOURCE_IDS_LIMIT_METHOD=KEEP +### Maximum number of file paths stored in entity/relation file_path field +# MAX_FILE_PATHS=30 + +### maximum number of related chunks per source entity or relation +### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) +### Higher values increase re-ranking time +# RELATED_CHUNK_NUMBER=5 + ############################### ### Concurrency Configuration ############################### diff --git a/lightrag/constants.py b/lightrag/constants.py index 62ca1888..ad12cccf 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -60,14 +60,14 @@ DEFAULT_RERANK_BINDING = "null" # Default source ids limit in meta data for entity and relation DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 -SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" -SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" +SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" # Keep oldest +SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" # First In First Out (Keep newest) DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP 
VALID_SOURCE_IDS_LIMIT_METHODS = { SOURCE_IDS_LIMIT_METHOD_KEEP, SOURCE_IDS_LIMIT_METHOD_FIFO, } -# Default file_path limit in meta data for entity and relation +# Default file_path limit in meta data for entity and relation (Use same limit method as source_ids) DEFAULT_MAX_FILE_PATHS = 2 # Field length of file_path in Milvus Schema for entity and relation (Should not be changed) diff --git a/lightrag/operate.py b/lightrag/operate.py index 6b409f21..3e889eb7 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1188,7 +1188,7 @@ async def _rebuild_single_entity( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1347,7 +1347,7 @@ async def _rebuild_single_relationship( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." ) logger.info( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1623,7 +1623,9 @@ async def _merge_nodes_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f" ({', '.join([truncation_info, dd_message])})" + status_message += ( + f" ({', '.join(filter(None, [truncation_info, dd_message]))})" + ) if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1692,7 +1694,7 @@ async def _merge_nodes_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1907,7 +1909,9 @@ async def _merge_edges_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f" ({', '.join([truncation_info, dd_message])})" + status_message += ( + f" ({', '.join(filter(None, [truncation_info, dd_message]))})" + ) if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1991,7 +1995,7 @@ async def _merge_edges_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." 
) logger.info( f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" From 637b850ec55593b67566f3514f4d37edc84979d1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 23:03:01 +0800 Subject: [PATCH 10/25] Add truncation indicator and update property labels in graph view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add truncate tooltip to source_id field • Add visual truncation indicator (†) • Bump API version to 0242 --- lightrag/api/__init__.py | 2 +- .../src/components/graph/PropertiesView.tsx | 22 ++++++++++++++----- lightrag_webui/src/locales/ar.json | 6 ++--- lightrag_webui/src/locales/en.json | 4 ++-- lightrag_webui/src/locales/fr.json | 4 ++-- lightrag_webui/src/locales/zh.json | 4 ++-- lightrag_webui/src/locales/zh_TW.json | 4 ++-- 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/lightrag/api/__init__.py b/lightrag/api/__init__.py index 822818a6..e1baefb9 100644 --- a/lightrag/api/__init__.py +++ b/lightrag/api/__init__.py @@ -1 +1 @@ -__api_version__ = "0241" +__api_version__ = "0242" diff --git a/lightrag_webui/src/components/graph/PropertiesView.tsx b/lightrag_webui/src/components/graph/PropertiesView.tsx index 3ebdfd29..b46eb5b4 100644 --- a/lightrag_webui/src/components/graph/PropertiesView.tsx +++ b/lightrag_webui/src/components/graph/PropertiesView.tsx @@ -183,7 +183,8 @@ const PropertyRow = ({ entityType, sourceId, targetId, - isEditable = false + isEditable = false, + truncate }: { name: string value: any @@ -197,6 +198,7 @@ const PropertyRow = ({ sourceId?: string targetId?: string isEditable?: boolean + truncate?: string }) => { const { t } = useTranslation() @@ -216,7 +218,12 @@ const PropertyRow = ({ // Format the value to convert to newlines const formattedValue = formatValueWithSeparators(value) - const formattedTooltip = tooltip || formatValueWithSeparators(value) + let formattedTooltip = tooltip || formatValueWithSeparators(value) + + // If this is source_id field and truncate info exists, append it to the tooltip + if (name === 'source_id' && truncate) { + formattedTooltip += `\n(${truncate} truncated)` + } // Use EditablePropertyRow for editable fields (description, entity_id and keywords) if (isEditable && (name === 'description' || name === 'entity_id' || name === 'keywords')) { @@ -241,7 +248,10 @@ const PropertyRow = ({ // For non-editable fields, use the regular Text component return (
- {getPropertyNameTranslation(name)}: + + {getPropertyNameTranslation(name)} + {name === 'source_id' && truncate && } + : { {Object.keys(node.properties) .sort() .map((name) => { - if (name === 'created_at') return null; // Hide created_at property + if (name === 'created_at' || name === 'truncate') return null; // Hide created_at and truncate properties return ( { entityId={node.properties['entity_id']} entityType="node" isEditable={name === 'description' || name === 'entity_id'} + truncate={node.properties['truncate']} /> ) })} @@ -373,7 +384,7 @@ const EdgePropertiesView = ({ edge }: { edge: EdgeType }) => { {Object.keys(edge.properties) .sort() .map((name) => { - if (name === 'created_at') return null; // Hide created_at property + if (name === 'created_at' || name === 'truncate') return null; // Hide created_at and truncate properties return ( { sourceId={edge.sourceNode?.properties['entity_id'] || edge.source} targetId={edge.targetNode?.properties['entity_id'] || edge.target} isEditable={name === 'description' || name === 'keywords'} + truncate={edge.properties['truncate']} /> ) })} diff --git a/lightrag_webui/src/locales/ar.json b/lightrag_webui/src/locales/ar.json index fb5e84bb..6f2703ca 100644 --- a/lightrag_webui/src/locales/ar.json +++ b/lightrag_webui/src/locales/ar.json @@ -318,10 +318,10 @@ "description": "الوصف", "entity_id": "الاسم", "entity_type": "النوع", - "source_id": "معرف المصدر", + "source_id": "C-ID", "Neighbour": "الجار", - "file_path": "المصدر", - "keywords": "الكلمات الرئيسية", + "file_path": "File", + "keywords": "Keyword", "weight": "الوزن" } }, diff --git a/lightrag_webui/src/locales/en.json b/lightrag_webui/src/locales/en.json index 3f4d04a9..418ac296 100644 --- a/lightrag_webui/src/locales/en.json +++ b/lightrag_webui/src/locales/en.json @@ -318,9 +318,9 @@ "description": "Description", "entity_id": "Name", "entity_type": "Type", - "source_id": "SrcID", + "source_id": "C-ID", "Neighbour": "Neigh", - "file_path": "Source", + "file_path": "File", "keywords": "Keys", "weight": "Weight" } diff --git a/lightrag_webui/src/locales/fr.json b/lightrag_webui/src/locales/fr.json index 9104d34c..463f05eb 100644 --- a/lightrag_webui/src/locales/fr.json +++ b/lightrag_webui/src/locales/fr.json @@ -318,9 +318,9 @@ "description": "Description", "entity_id": "Nom", "entity_type": "Type", - "source_id": "ID source", + "source_id": "C-ID", "Neighbour": "Voisin", - "file_path": "Source", + "file_path": "File", "keywords": "Keys", "weight": "Poids" } diff --git a/lightrag_webui/src/locales/zh.json b/lightrag_webui/src/locales/zh.json index e6f87e18..40d8cdb0 100644 --- a/lightrag_webui/src/locales/zh.json +++ b/lightrag_webui/src/locales/zh.json @@ -318,9 +318,9 @@ "description": "描述", "entity_id": "名称", "entity_type": "类型", - "source_id": "信源ID", + "source_id": "C-ID", "Neighbour": "邻接", - "file_path": "信源", + "file_path": "文件", "keywords": "Keys", "weight": "权重" } diff --git a/lightrag_webui/src/locales/zh_TW.json b/lightrag_webui/src/locales/zh_TW.json index 003ce313..5ea179c2 100644 --- a/lightrag_webui/src/locales/zh_TW.json +++ b/lightrag_webui/src/locales/zh_TW.json @@ -318,9 +318,9 @@ "description": "描述", "entity_id": "名稱", "entity_type": "類型", - "source_id": "來源ID", + "source_id": "C-ID", "Neighbour": "鄰接", - "file_path": "來源", + "file_path": "檔案", "keywords": "Keys", "weight": "權重" } From e01c998ee92d930a689fa8655227d4a632d2615b Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 23:48:04 +0800 Subject: [PATCH 11/25] Track placeholders in file paths for 
accurate source count display MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add has_placeholder tracking variable • Detect placeholder patterns in paths • Show + sign for truncated counts --- lightrag/operate.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 3e889eb7..a5e168be 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1579,6 +1579,7 @@ async def _merge_nodes_then_upsert( truncation_info = "" dd_message = "" + has_placeholder = False # Initialize to track placeholder in file paths # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions @@ -1620,7 +1621,15 @@ async def _merge_nodes_then_upsert( # Add truncation info from apply_source_ids_limit if truncation occurred if len(source_ids) < len(full_source_ids): - truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + # Add + sign if has_placeholder is True, indicating actual file count is higher + full_source_count_str = ( + f"{len(full_source_ids)}+" + if has_placeholder + else str(len(full_source_ids)) + ) + truncation_info = ( + f"{limit_method}:{len(source_ids)}/{full_source_count_str}" + ) if dd_message or truncation_info: status_message += ( @@ -1650,6 +1659,7 @@ async def _merge_nodes_then_upsert( # Collect and apply limit file_paths_list = [] seen_paths = set() + has_placeholder = False # Track if already_file_paths contains placeholder # Get placeholder to filter it out file_path_placeholder = global_config.get( @@ -1658,12 +1668,12 @@ async def _merge_nodes_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: + # Check if this is a placeholder record + if fp and fp.startswith(f"...{file_path_placeholder}"): + has_placeholder = True + continue # Skip placeholders (format: "...{placeholder}(showing X of Y)...") - if ( - fp - and not fp.startswith(f"...{file_path_placeholder}") - and fp not in seen_paths - ): + if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) @@ -1862,6 +1872,7 @@ async def _merge_edges_then_upsert( truncation_info = "" dd_message = "" + has_placeholder = False # Initialize to track placeholder in file paths # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions @@ -1906,7 +1917,15 @@ async def _merge_edges_then_upsert( # Add truncation info from apply_source_ids_limit if truncation occurred if len(source_ids) < len(full_source_ids): - truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + # Add + sign if has_placeholder is True, indicating actual file count is higher + full_source_count_str = ( + f"{len(full_source_ids)}+" + if has_placeholder + else str(len(full_source_ids)) + ) + truncation_info = ( + f"{limit_method}:{len(source_ids)}/{full_source_count_str}" + ) if dd_message or truncation_info: status_message += ( @@ -1951,6 +1970,7 @@ async def _merge_edges_then_upsert( # Collect and apply limit file_paths_list = [] seen_paths = set() + has_placeholder = False # Track if already_file_paths contains placeholder # Get placeholder to filter it out file_path_placeholder = global_config.get( @@ -1959,12 +1979,12 @@ async def _merge_edges_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: + # Check if this is a placeholder record + if fp 
and fp.startswith(f"...{file_path_placeholder}"): + has_placeholder = True + continue # Skip placeholders (format: "...{placeholder}(showing X of Y)...") - if ( - fp - and not fp.startswith(f"...{file_path_placeholder}") - and fp not in seen_paths - ): + if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) From 665f60b90f4611c180914ffe296b9cd51b3969ff Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 03:19:34 +0800 Subject: [PATCH 12/25] Refactor entity/relation merge to consolidate VDB operations within functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Move VDB upserts into merge functions • Fix early return data structure issues • Update status messages (IGNORE_NEW → KEEP) • Consolidate error handling paths • Improve relationship content format --- lightrag/operate.py | 214 ++++++++++++++++++++++++-------------------- 1 file changed, 117 insertions(+), 97 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index a5e168be..290f19b6 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1470,6 +1470,7 @@ async def _merge_nodes_then_upsert( entity_name: str, nodes_data: list[dict], knowledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage | None, global_config: dict, pipeline_status: dict = None, pipeline_status_lock=None, @@ -1595,13 +1596,25 @@ async def _merge_nodes_then_upsert( if already_node else "(no description)" ) - llm_was_used = False - status_message = f"Skip merge for `{entity_name}`: IGNORE_NEW limit reached" + status_message = f"Skip merge for `{entity_name}`: KEEP limit reached" logger.debug(status_message) if pipeline_status is not None and pipeline_status_lock is not None: async with pipeline_status_lock: pipeline_status["latest_message"] = status_message pipeline_status["history_messages"].append(status_message) + existing_node_data = dict(already_node or {}) + if not existing_node_data: + existing_node_data = { + "entity_id": entity_name, + "entity_type": entity_type, + "description": description, + "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), + "file_path": GRAPH_FIELD_SEP.join(already_file_paths), + "created_at": int(time.time()), + "truncate": "", + } + existing_node_data["entity_name"] = entity_name + return existing_node_data elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( @@ -1726,6 +1739,25 @@ async def _merge_nodes_then_upsert( node_data=node_data, ) node_data["entity_name"] = entity_name + if entity_vdb is not None: + entity_vdb_id = compute_mdhash_id(str(entity_name), prefix="ent-") + entity_content = f"{entity_name}\n{description}" + data_for_vdb = { + entity_vdb_id: { + "entity_name": entity_name, + "entity_type": entity_type, + "content": entity_content, + "source_id": source_id, + "file_path": file_path, + } + } + await safe_vdb_operation_with_exception( + operation=lambda payload=data_for_vdb: entity_vdb.upsert(payload), + operation_name="entity_upsert", + entity_name=entity_name, + max_retries=3, + retry_delay=0.1, + ) return node_data @@ -1734,6 +1766,8 @@ async def _merge_edges_then_upsert( tgt_id: str, edges_data: list[dict], knowledge_graph_inst: BaseGraphStorage, + relationships_vdb: BaseVectorStorage | None, + entity_vdb: BaseVectorStorage | None, global_config: dict, pipeline_status: dict = None, pipeline_status_lock=None, @@ -1744,6 +1778,7 @@ async def _merge_edges_then_upsert( if src_id == tgt_id: return None + already_edge = None 
already_weights = [] already_source_ids = [] already_description = [] @@ -1825,13 +1860,13 @@ async def _merge_edges_then_upsert( global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP ) - # Only apply filtering in IGNORE_NEW mode + # Only apply filtering in KEEP(ignore new) mode if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_edges = [] for dp in edges_data: source_id = dp.get("source_id") - # Skip relationship fragments sourced from chunks dropped by the IGNORE_NEW cap + # Skip relationship fragments sourced from chunks dropped by keep oldest cap if ( source_id and source_id not in allowed_source_ids @@ -1889,15 +1924,29 @@ async def _merge_edges_then_upsert( if already_edge else "(no description)" ) - llm_was_used = False status_message = ( - f"Skip merge for `{src_id}`~`{tgt_id}`: IGNORE_NEW limit reached" + f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" ) logger.debug(status_message) if pipeline_status is not None and pipeline_status_lock is not None: async with pipeline_status_lock: pipeline_status["latest_message"] = status_message pipeline_status["history_messages"].append(status_message) + existing_edge_data = dict(already_edge or {}) + if not existing_edge_data: + existing_edge_data = { + "description": description, + "keywords": GRAPH_FIELD_SEP.join(already_keywords), + "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), + "file_path": GRAPH_FIELD_SEP.join(already_file_paths), + "weight": sum(already_weights) if already_weights else 0.0, + "truncate": "", + "created_at": int(time.time()), + } + existing_edge_data.setdefault("created_at", int(time.time())) + existing_edge_data["src_id"] = src_id + existing_edge_data["tgt_id"] = tgt_id + return existing_edge_data elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( @@ -2025,17 +2074,38 @@ async def _merge_edges_then_upsert( for need_insert_id in [src_id, tgt_id]: if not (await knowledge_graph_inst.has_node(need_insert_id)): + node_created_at = int(time.time()) node_data = { "entity_id": need_insert_id, "source_id": source_id, "description": description, "entity_type": "UNKNOWN", "file_path": file_path, - "created_at": int(time.time()), + "created_at": node_created_at, "truncate": "", } await knowledge_graph_inst.upsert_node(need_insert_id, node_data=node_data) + if entity_vdb is not None: + entity_vdb_id = compute_mdhash_id(need_insert_id, prefix="ent-") + entity_content = f"{need_insert_id}\n{description}" + vdb_data = { + entity_vdb_id: { + "content": entity_content, + "entity_name": need_insert_id, + "source_id": source_id, + "entity_type": "UNKNOWN", + "file_path": file_path, + } + } + await safe_vdb_operation_with_exception( + operation=lambda payload=vdb_data: entity_vdb.upsert(payload), + operation_name="added_entity_upsert", + entity_name=need_insert_id, + max_retries=3, + retry_delay=0.1, + ) + # Track entities added during edge processing if added_entities is not None: entity_data = { @@ -2044,10 +2114,11 @@ async def _merge_edges_then_upsert( "description": description, "source_id": source_id, "file_path": file_path, - "created_at": int(time.time()), + "created_at": node_created_at, } added_entities.append(entity_data) + edge_created_at = int(time.time()) await knowledge_graph_inst.upsert_edge( src_id, tgt_id, @@ -2057,7 +2128,7 @@ async def _merge_edges_then_upsert( keywords=keywords, source_id=source_id, file_path=file_path, - created_at=int(time.time()), + 
created_at=edge_created_at, truncate=truncation_info, ), ) @@ -2069,10 +2140,41 @@ async def _merge_edges_then_upsert( keywords=keywords, source_id=source_id, file_path=file_path, - created_at=int(time.time()), + created_at=edge_created_at, truncate=truncation_info, + weight=weight, ) + if relationships_vdb is not None: + rel_vdb_id = compute_mdhash_id(src_id + tgt_id, prefix="rel-") + rel_vdb_id_reverse = compute_mdhash_id(tgt_id + src_id, prefix="rel-") + try: + await relationships_vdb.delete([rel_vdb_id, rel_vdb_id_reverse]) + except Exception as e: + logger.debug( + f"Could not delete old relationship vector records {rel_vdb_id}, {rel_vdb_id_reverse}: {e}" + ) + rel_content = f"{keywords}\t{src_id}\n{tgt_id}\n{description}" + vdb_data = { + rel_vdb_id: { + "src_id": src_id, + "tgt_id": tgt_id, + "source_id": source_id, + "content": rel_content, + "keywords": keywords, + "description": description, + "weight": weight, + "file_path": file_path, + } + } + await safe_vdb_operation_with_exception( + operation=lambda payload=vdb_data: relationships_vdb.upsert(payload), + operation_name="relationship_upsert", + entity_name=f"{src_id}-{tgt_id}", + max_retries=3, + retry_delay=0.2, + ) + return edge_data @@ -2162,12 +2264,12 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.debug(f"Inserting {entity_name} in Graph") - # Graph database operation (critical path, must succeed) + logger.debug(f"Processing entity {entity_name}") entity_data = await _merge_nodes_then_upsert( entity_name, entities, knowledge_graph_inst, + entity_vdb, global_config, pipeline_status, pipeline_status_lock, @@ -2175,36 +2277,9 @@ async def merge_nodes_and_edges( entity_chunks_storage, ) - # Vector database operation (equally critical, must succeed) - if entity_vdb is not None and entity_data: - data_for_vdb = { - compute_mdhash_id( - str(entity_data["entity_name"]), prefix="ent-" - ): { - "entity_name": entity_data["entity_name"], - "entity_type": entity_data["entity_type"], - "content": f"{entity_data['entity_name']}\n{entity_data['description']}", - "source_id": entity_data["source_id"], - "file_path": entity_data.get( - "file_path", "unknown_source" - ), - } - } - - logger.debug(f"Inserting {entity_name} in Graph") - # Use safe operation wrapper - VDB failure must throw exception - await safe_vdb_operation_with_exception( - operation=lambda: entity_vdb.upsert(data_for_vdb), - operation_name="entity_upsert", - entity_name=entity_name, - max_retries=3, - retry_delay=0.1, - ) - return entity_data except Exception as e: - # Any database operation failure is critical error_msg = ( f"Critical error in entity processing for `{entity_name}`: {e}" ) @@ -2294,12 +2369,14 @@ async def merge_nodes_and_edges( try: added_entities = [] # Track entities added during edge processing - # Graph database operation (critical path, must succeed) + logger.debug(f"Processing relation {sorted_edge_key}") edge_data = await _merge_edges_then_upsert( edge_key[0], edge_key[1], edges, knowledge_graph_inst, + relationships_vdb, + entity_vdb, global_config, pipeline_status, pipeline_status_lock, @@ -2311,66 +2388,9 @@ async def merge_nodes_and_edges( if edge_data is None: return None, [] - # Vector database operation (equally critical, must succeed) - if relationships_vdb is not None: - data_for_vdb = { - compute_mdhash_id( - edge_data["src_id"] + edge_data["tgt_id"], prefix="rel-" - ): { - "src_id": edge_data["src_id"], - "tgt_id": edge_data["tgt_id"], - "keywords": edge_data["keywords"], - 
"content": f"{edge_data['src_id']}\t{edge_data['tgt_id']}\n{edge_data['keywords']}\n{edge_data['description']}", - "source_id": edge_data["source_id"], - "file_path": edge_data.get( - "file_path", "unknown_source" - ), - "weight": edge_data.get("weight", 1.0), - } - } - - # Use safe operation wrapper - VDB failure must throw exception - await safe_vdb_operation_with_exception( - operation=lambda: relationships_vdb.upsert(data_for_vdb), - operation_name="relationship_upsert", - entity_name=f"{edge_data['src_id']}-{edge_data['tgt_id']}", - max_retries=3, - retry_delay=0.1, - ) - - # Update added_entities to entity vector database using safe operation wrapper - if added_entities and entity_vdb is not None: - for entity_data in added_entities: - entity_vdb_id = compute_mdhash_id( - entity_data["entity_name"], prefix="ent-" - ) - entity_content = f"{entity_data['entity_name']}\n{entity_data['description']}" - - vdb_data = { - entity_vdb_id: { - "content": entity_content, - "entity_name": entity_data["entity_name"], - "source_id": entity_data["source_id"], - "entity_type": entity_data["entity_type"], - "file_path": entity_data.get( - "file_path", "unknown_source" - ), - } - } - - # Use safe operation wrapper - VDB failure must throw exception - await safe_vdb_operation_with_exception( - operation=lambda data=vdb_data: entity_vdb.upsert(data), - operation_name="added_entity_upsert", - entity_name=entity_data["entity_name"], - max_retries=3, - retry_delay=0.1, - ) - return edge_data, added_entities except Exception as e: - # Any database operation failure is critical error_msg = f"Critical error in relationship processing for `{sorted_edge_key}`: {e}" logger.error(error_msg) From 1154c5683fd3b9b3c378b0dea3f1791bcf7b09cf Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 04:41:15 +0800 Subject: [PATCH 13/25] Refactor deduplication calculation and remove unused variables --- lightrag/operate.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 290f19b6..325bea25 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1498,8 +1498,6 @@ async def _merge_nodes_then_upsert( reverse=True, )[0][0] # Get the entity type with the highest count - original_nodes_count = len(nodes_data) - new_source_ids = [dp["source_id"] for dp in nodes_data if dp.get("source_id")] existing_full_source_ids = [] @@ -1584,12 +1582,12 @@ async def _merge_nodes_then_upsert( # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions - deduplicated_num = original_nodes_count - len(sorted_descriptions) + num_fragment = len(description_list) + already_fragment = len(already_description) + deduplicated_num = already_fragment + len(nodes_data) - num_fragment if deduplicated_num > 0: dd_message = f"dd:{deduplicated_num}" - num_fragment = len(description_list) - already_fragment = len(already_description) if skip_summary_due_to_limit: description = ( already_node.get("description", "(no description)") @@ -1818,8 +1816,6 @@ async def _merge_edges_then_upsert( ) ) - original_edges_count = len(edges_data) - new_source_ids = [dp["source_id"] for dp in edges_data if dp.get("source_id")] storage_key = make_relation_chunk_key(src_id, tgt_id) @@ -1911,12 +1907,12 @@ async def _merge_edges_then_upsert( # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions - deduplicated_num = original_edges_count - 
len(sorted_descriptions) - if deduplicated_num > 0: - dd_message = f"dd:{deduplicated_num}" num_fragment = len(description_list) already_fragment = len(already_description) + deduplicated_num = already_fragment + len(edges_data) - num_fragment + if deduplicated_num > 0: + dd_message = f"dd:{deduplicated_num}" if skip_summary_due_to_limit: description = ( @@ -1924,9 +1920,7 @@ async def _merge_edges_then_upsert( if already_edge else "(no description)" ) - status_message = ( - f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" - ) + status_message = f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" logger.debug(status_message) if pipeline_status is not None and pipeline_status_lock is not None: async with pipeline_status_lock: From 019dff5248d1d5111e9fbf8791cbc6423ad7c4a7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 04:46:07 +0800 Subject: [PATCH 14/25] Update truncation message format in properties tooltip --- lightrag_webui/src/components/graph/PropertiesView.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag_webui/src/components/graph/PropertiesView.tsx b/lightrag_webui/src/components/graph/PropertiesView.tsx index b46eb5b4..39b9a448 100644 --- a/lightrag_webui/src/components/graph/PropertiesView.tsx +++ b/lightrag_webui/src/components/graph/PropertiesView.tsx @@ -222,7 +222,7 @@ const PropertyRow = ({ // If this is source_id field and truncate info exists, append it to the tooltip if (name === 'source_id' && truncate) { - formattedTooltip += `\n(${truncate} truncated)` + formattedTooltip += `\n(Truncation-${truncate})` } // Use EditablePropertyRow for editable fields (description, entity_id and keywords) From cd1c48beaf33250a1abaf3fe3cd793822f413aea Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 05:03:57 +0800 Subject: [PATCH 15/25] Standardize placeholder format to use colon separator consistently --- lightrag/operate.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 325bea25..15f18dca 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1188,7 +1188,7 @@ async def _rebuild_single_entity( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1347,7 +1347,7 @@ async def _rebuild_single_relationship( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." 
) logger.info( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1680,10 +1680,9 @@ async def _merge_nodes_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders has_placeholder = True continue - # Skip placeholders (format: "...{placeholder}(showing X of Y)...") if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) @@ -1715,7 +1714,7 @@ async def _merge_nodes_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -2023,10 +2022,9 @@ async def _merge_edges_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders has_placeholder = True continue - # Skip placeholders (format: "...{placeholder}(showing X of Y)...") if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) @@ -2058,7 +2056,7 @@ async def _merge_edges_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." ) logger.info( f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" From 1248b3ab0436c0d9aebc96cde611890440b89153 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 05:30:09 +0800 Subject: [PATCH 16/25] Increase default limits for source IDs and file paths in metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Entity source IDs: 3 → 300 • Relation source IDs: 3 → 300 • File paths: 2 → 30 --- lightrag/constants.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index ad12cccf..7c2b2701 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -58,8 +58,8 @@ DEFAULT_MIN_RERANK_SCORE = 0.0 DEFAULT_RERANK_BINDING = "null" # Default source ids limit in meta data for entity and relation -DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 -DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300 +DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300 SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" # Keep oldest SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" # First In First Out (Keep newest) DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP @@ -68,7 +68,7 @@ VALID_SOURCE_IDS_LIMIT_METHODS = { SOURCE_IDS_LIMIT_METHOD_FIFO, } # Default file_path limit in meta data for entity and relation (Use same limit method as source_ids) -DEFAULT_MAX_FILE_PATHS = 2 +DEFAULT_MAX_FILE_PATHS = 30 # Field length of file_path in Milvus Schema for entity and relation (Should not be changed) # file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata. 
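
[Editor's illustrative sketch — not part of any patch in this series.] The patches above keep tightening how chunk ids and file paths attached to an entity or relation are capped: KEEP discards the newest entries and preserves the oldest ones, FIFO discards the oldest and preserves the newest, and a placeholder entry records that truncation already happened so later merges can show a "+" on the count. The standalone Python sketch below only approximates that behaviour under those assumptions; `limit_ids_sketch` and the `and_more` placeholder are hypothetical names and do not reproduce the signature of LightRAG's apply_source_ids_limit.

    # Sketch only: mimics the KEEP/FIFO capping described by the patches above.
    SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"   # keep the oldest ids (head of the list)
    SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"   # keep the newest ids (tail of the list)

    def limit_ids_sketch(
        ids: list[str],
        max_items: int,
        method: str = SOURCE_IDS_LIMIT_METHOD_FIFO,
        placeholder: str | None = None,
    ) -> list[str]:
        """Return at most `max_items` ids, truncating from the head or the tail.

        KEEP drops the newest entries (tail), FIFO drops the oldest entries
        (head). When `placeholder` is given, a marker such as
        "...and_more...(FIFO)" is appended so a later merge can detect that
        the list was already truncated (analogous to the has_placeholder /
        "+" indicator added in the patches above).
        """
        if max_items <= 0 or len(ids) <= max_items:
            return list(ids)

        if method == SOURCE_IDS_LIMIT_METHOD_FIFO:
            kept = ids[-max_items:]   # keep newest
        else:
            kept = ids[:max_items]    # keep oldest

        if placeholder:
            kept = kept + [f"...{placeholder}...({method})"]
        return kept

    if __name__ == "__main__":
        chunk_ids = [f"chunk-{i}" for i in range(1, 8)]
        print(limit_ids_sketch(chunk_ids, 3, "KEEP"))
        # ['chunk-1', 'chunk-2', 'chunk-3']
        print(limit_ids_sketch(chunk_ids, 3, "FIFO", placeholder="and_more"))
        # ['chunk-5', 'chunk-6', 'chunk-7', '...and_more...(FIFO)']

Under these assumptions the trade-off is the one the later patches act on: KEEP avoids re-summarising once the cap is reached (fewer merges, faster), while FIFO keeps the metadata biased toward the most recently inserted chunks.
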
From a5253244f97d5fc8930ad0cd12270ab557d1f4f3 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 06:33:34 +0800 Subject: [PATCH 17/25] Simplify skip logging and reduce pipeline status updates --- lightrag/operate.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 15f18dca..4b34f474 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1589,17 +1589,12 @@ async def _merge_nodes_then_upsert( dd_message = f"dd:{deduplicated_num}" if skip_summary_due_to_limit: + logger.info(f"Skipped `{entity_name}`: KEEP old chunks") description = ( already_node.get("description", "(no description)") if already_node else "(no description)" ) - status_message = f"Skip merge for `{entity_name}`: KEEP limit reached" - logger.debug(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) existing_node_data = dict(already_node or {}) if not existing_node_data: existing_node_data = { @@ -1914,17 +1909,12 @@ async def _merge_edges_then_upsert( dd_message = f"dd:{deduplicated_num}" if skip_summary_due_to_limit: + logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks") description = ( already_edge.get("description", "(no description)") if already_edge else "(no description)" ) - status_message = f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" - logger.debug(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) existing_edge_data = dict(already_edge or {}) if not existing_edge_data: existing_edge_data = { From be3d274a0b852962d56abd5c1d33bf86c8288700 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 15:16:47 +0800 Subject: [PATCH 18/25] Refactor node and edge merging logic with improved code structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add numbered steps for clarity • Improve early return handling • Enhance file path limiting logic --- lightrag/operate.py | 553 +++++++++++++++++++++----------------------- 1 file changed, 262 insertions(+), 291 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 4b34f474..60fa66a3 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -57,6 +57,7 @@ from lightrag.constants import ( SOURCE_IDS_LIMIT_METHOD_KEEP, SOURCE_IDS_LIMIT_METHOD_FIFO, DEFAULT_FILE_PATH_MORE_PLACEHOLDER, + DEFAULT_MAX_FILE_PATHS, ) from lightrag.kg.shared_storage import get_storage_keyed_lock import time @@ -1483,6 +1484,7 @@ async def _merge_nodes_then_upsert( already_description = [] already_file_paths = [] + # 1. 
Get existing node data from knowledge graph already_node = await knowledge_graph_inst.get_node(entity_name) if already_node: already_entity_types.append(already_node["entity_type"]) @@ -1490,14 +1492,6 @@ async def _merge_nodes_then_upsert( already_file_paths.extend(already_node["file_path"].split(GRAPH_FIELD_SEP)) already_description.extend(already_node["description"].split(GRAPH_FIELD_SEP)) - entity_type = sorted( - Counter( - [dp["entity_type"] for dp in nodes_data] + already_entity_types - ).items(), - key=lambda x: x[1], - reverse=True, - )[0][0] # Get the entity type with the highest count - new_source_ids = [dp["source_id"] for dp in nodes_data if dp.get("source_id")] existing_full_source_ids = [] @@ -1513,6 +1507,7 @@ async def _merge_nodes_then_upsert( chunk_id for chunk_id in already_source_ids if chunk_id ] + # 2. Merging new source ids with existing ones full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) if entity_chunks_storage is not None and full_source_ids: @@ -1525,6 +1520,7 @@ async def _merge_nodes_then_upsert( } ) + # 3. Finalize source_id by applying source ids limit limit_method = global_config.get("source_ids_limit_method") max_source_limit = global_config.get("max_source_ids_per_entity") source_ids = apply_source_ids_limit( @@ -1534,7 +1530,7 @@ async def _merge_nodes_then_upsert( identifier=f"`{entity_name}`", ) - # Only apply filtering in KEEP(ignore new) mode + # 4. Only keep nodes not filter by apply_source_ids_limit if limit_method is KEEP if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_nodes = [] @@ -1549,18 +1545,38 @@ async def _merge_nodes_then_upsert( continue filtered_nodes.append(dp) nodes_data = filtered_nodes - else: - # In FIFO mode, keep all node descriptions - truncation happens at source_ids level only + else: # In FIFO mode, keep all nodes - truncation happens at source_ids level only nodes_data = list(nodes_data) - skip_summary_due_to_limit = ( + # 5. Check if we need to skip summary due to source_ids limit + if ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit and not nodes_data - and already_description - ) + ): + if already_node: + logger.info(f"Skipped `{entity_name}`: KEEP old chunks") + existing_node_data = dict(already_node) + return existing_node_data + else: + logger.error(f"Internal Error: already_node missing for `{entity_name}`") + raise ValueError( + f"Internal Error: already_node missing for `{entity_name}`" + ) - # Deduplicate by description, keeping first occurrence + # 6.1 Finalize source_id + source_id = GRAPH_FIELD_SEP.join(source_ids) + + # 6.2 Finalize entity type by highest count + entity_type = sorted( + Counter( + [dp["entity_type"] for dp in nodes_data] + already_entity_types + ).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + # 7. 
Deduplicate nodes by description, keeping first occurrence in the same document unique_nodes = {} for dp in nodes_data: desc = dp.get("description") @@ -1569,154 +1585,121 @@ async def _merge_nodes_then_upsert( if desc not in unique_nodes: unique_nodes[desc] = dp - # Sort description by timestamp, then by description length (largest to smallest) when timestamps are the same + # Sort description by timestamp, then by description length when timestamps are the same sorted_nodes = sorted( unique_nodes.values(), key=lambda x: (x.get("timestamp", 0), -len(x.get("description", ""))), ) sorted_descriptions = [dp["description"] for dp in sorted_nodes] - truncation_info = "" - dd_message = "" - has_placeholder = False # Initialize to track placeholder in file paths - # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions - num_fragment = len(description_list) - already_fragment = len(already_description) - deduplicated_num = already_fragment + len(nodes_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"dd:{deduplicated_num}" - - if skip_summary_due_to_limit: - logger.info(f"Skipped `{entity_name}`: KEEP old chunks") - description = ( - already_node.get("description", "(no description)") - if already_node - else "(no description)" - ) - existing_node_data = dict(already_node or {}) - if not existing_node_data: - existing_node_data = { - "entity_id": entity_name, - "entity_type": entity_type, - "description": description, - "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), - "file_path": GRAPH_FIELD_SEP.join(already_file_paths), - "created_at": int(time.time()), - "truncate": "", - } - existing_node_data["entity_name"] = entity_name - return existing_node_data - elif num_fragment > 0: - # Get summary and LLM usage status - description, llm_was_used = await _handle_entity_relation_summary( - "Entity", - entity_name, - description_list, - GRAPH_FIELD_SEP, - global_config, - llm_response_cache, - ) - - # Log based on actual LLM usage - if llm_was_used: - status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" - else: - status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" - - # Add truncation info from apply_source_ids_limit if truncation occurred - if len(source_ids) < len(full_source_ids): - # Add + sign if has_placeholder is True, indicating actual file count is higher - full_source_count_str = ( - f"{len(full_source_ids)}+" - if has_placeholder - else str(len(full_source_ids)) - ) - truncation_info = ( - f"{limit_method}:{len(source_ids)}/{full_source_count_str}" - ) - - if dd_message or truncation_info: - status_message += ( - f" ({', '.join(filter(None, [truncation_info, dd_message]))})" - ) - - if already_fragment > 0 or llm_was_used: - logger.info(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) - else: - logger.debug(status_message) - - else: + if not description_list: logger.error(f"Entity {entity_name} has no description") - description = "(no description)" + raise ValueError(f"Entity {entity_name} has no description") - source_id = GRAPH_FIELD_SEP.join(source_ids) + # 8. 
Get summary description an LLM usage status + description, llm_was_used = await _handle_entity_relation_summary( + "Entity", + entity_name, + description_list, + GRAPH_FIELD_SEP, + global_config, + llm_response_cache, + ) - # Build file_path with count limit - if skip_summary_due_to_limit: - # Skip limit, keep original file_path - file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) - else: - # Collect and apply limit - file_paths_list = [] - seen_paths = set() - has_placeholder = False # Track if already_file_paths contains placeholder + # 9. Build file_path within MAX_FILE_PATHS + file_paths_list = [] + seen_paths = set() + has_placeholder = False # Indicating file_path has been truncated before - # Get placeholder to filter it out + max_file_paths = global_config.get("max_file_paths", DEFAULT_MAX_FILE_PATHS) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders + has_placeholder = True + continue + if fp and fp not in seen_paths: + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in nodes_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP + ) file_path_placeholder = global_config.get( "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER ) + # Add + sign to indicate actual file count is higher + original_count_str = ( + f"{len(file_paths_list)}+" if has_placeholder else str(len(file_paths_list)) + ) - # Collect from already_file_paths, excluding placeholder - for fp in already_file_paths: - # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders - has_placeholder = True - continue - if fp and fp not in seen_paths: - file_paths_list.append(fp) - seen_paths.add(fp) + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] - # Collect from new data - for dp in nodes_data: - file_path_item = dp.get("file_path") - if file_path_item and file_path_item not in seen_paths: - file_paths_list.append(file_path_item) - seen_paths.add(file_path_item) + file_paths_list.append(f"...{file_path_placeholder}({limit_method})...") + logger.info( + f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})" + ) + # Finalize file_path + file_path = GRAPH_FIELD_SEP.join(file_paths_list) - # Apply count limit - max_file_paths = global_config.get("max_file_paths") + # 10.Log based on actual LLM usage + num_fragment = len(description_list) + already_fragment = len(already_description) + if llm_was_used: + status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" + else: + status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" - if len(file_paths_list) > max_file_paths: - limit_method = global_config.get( - "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP - ) 
- file_path_placeholder = global_config.get( - "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER - ) - original_count = len(file_paths_list) + truncation_info = truncation_info_log = "" + if len(source_ids) < len(full_source_ids): + # Add truncation info from apply_source_ids_limit if truncation occurred + truncation_info_log = f"{limit_method} {len(source_ids)}/{len(full_source_ids)}" + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + truncation_info = truncation_info_log + else: + truncation_info = "Keep Old Chunks" - if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: - # FIFO: keep tail (newest), discard head - file_paths_list = file_paths_list[-max_file_paths:] - else: - # KEEP: keep head (earliest), discard tail - file_paths_list = file_paths_list[:max_file_paths] + deduplicated_num = already_fragment + len(nodes_data) - num_fragment + dd_message = "" + if deduplicated_num > 0: + # Duplicated description detected across multiple trucks for the same entity + dd_message = f"dd {deduplicated_num}" - file_paths_list.append( - f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." - ) - logger.info( - f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" - ) + if dd_message or truncation_info_log: + status_message += ( + f" ({', '.join(filter(None, [truncation_info_log, dd_message]))})" + ) - file_path = GRAPH_FIELD_SEP.join(file_paths_list) + # Add message to pipeline satus when merge happens + if already_fragment > 0 or llm_was_used: + logger.info(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + else: + logger.debug(status_message) + # 11. Update both graph and vector db node_data = dict( entity_id=entity_name, entity_type=entity_type, @@ -1777,6 +1760,7 @@ async def _merge_edges_then_upsert( already_keywords = [] already_file_paths = [] + # 1. Get existing edge data from graph storage if await knowledge_graph_inst.has_edge(src_id, tgt_id): already_edge = await knowledge_graph_inst.get_edge(src_id, tgt_id) # Handle the case where get_edge returns None or missing fields @@ -1826,6 +1810,7 @@ async def _merge_edges_then_upsert( chunk_id for chunk_id in already_source_ids if chunk_id ] + # 2. Merge new source ids with existing ones full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) if relation_chunks_storage is not None and full_source_ids: @@ -1838,6 +1823,7 @@ async def _merge_edges_then_upsert( } ) + # 3. Finalize source_id by applying source ids limit limit_method = global_config.get("source_ids_limit_method") max_source_limit = global_config.get("max_source_ids_per_relation") source_ids = apply_source_ids_limit( @@ -1850,7 +1836,7 @@ async def _merge_edges_then_upsert( global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP ) - # Only apply filtering in KEEP(ignore new) mode + # 4. 
Only keep edges with source_id in the final source_ids list if in KEEP mode if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_edges = [] @@ -1865,21 +1851,49 @@ async def _merge_edges_then_upsert( continue filtered_edges.append(dp) edges_data = filtered_edges - else: - # In FIFO mode, keep all edge descriptions - truncation happens at source_ids level only + else: # In FIFO mode, keep all edges - truncation happens at source_ids level only edges_data = list(edges_data) - skip_summary_due_to_limit = ( + # 5. Check if we need to skip summary due to source_ids limit + if ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit and not edges_data - and already_description - ) + ): + if already_edge: + logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks") + existing_edge_data = dict(already_edge) + return existing_edge_data + else: + logger.error( + f"Internal Error: already_node missing for `{src_id}`~`{tgt_id}`" + ) + raise ValueError( + f"Internal Error: already_node missing for `{src_id}`~`{tgt_id}`" + ) - # Process edges_data with None checks + # 6.1 Finalize source_id + source_id = GRAPH_FIELD_SEP.join(source_ids) + + # 6.2 Finalize weight by summing new edges and existing weights weight = sum([dp["weight"] for dp in edges_data] + already_weights) - # Deduplicate by description, keeping first occurrence + # 6.2 Finalize keywords by merging existing and new keywords + all_keywords = set() + # Process already_keywords (which are comma-separated) + for keyword_str in already_keywords: + if keyword_str: # Skip empty strings + all_keywords.update(k.strip() for k in keyword_str.split(",") if k.strip()) + # Process new keywords from edges_data + for edge in edges_data: + if edge.get("keywords"): + all_keywords.update( + k.strip() for k in edge["keywords"].split(",") if k.strip() + ) + # Join all unique keywords with commas + keywords = ",".join(sorted(all_keywords)) + + # 7. Deduplicate by description, keeping first occurrence in the same document unique_edges = {} for dp in edges_data: description_value = dp.get("description") @@ -1895,165 +1909,122 @@ async def _merge_edges_then_upsert( ) sorted_descriptions = [dp["description"] for dp in sorted_edges] - truncation_info = "" - dd_message = "" - has_placeholder = False # Initialize to track placeholder in file paths - # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions + if not description_list: + logger.error(f"Relation {src_id}~{tgt_id} has no description") + raise ValueError(f"Relation {src_id}~{tgt_id} has no description") - num_fragment = len(description_list) - already_fragment = len(already_description) - deduplicated_num = already_fragment + len(edges_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"dd:{deduplicated_num}" + # 8. Get summary description an LLM usage status + description, llm_was_used = await _handle_entity_relation_summary( + "Relation", + f"({src_id}, {tgt_id})", + description_list, + GRAPH_FIELD_SEP, + global_config, + llm_response_cache, + ) - if skip_summary_due_to_limit: - logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks") - description = ( - already_edge.get("description", "(no description)") - if already_edge - else "(no description)" + # 9. 
Build file_path within MAX_FILE_PATHS limit + file_paths_list = [] + seen_paths = set() + has_placeholder = False # Track if already_file_paths contains placeholder + + max_file_paths = global_config.get("max_file_paths", DEFAULT_MAX_FILE_PATHS) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + # Check if this is a placeholder record + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders + has_placeholder = True + continue + if fp and fp not in seen_paths: + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in edges_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP ) - existing_edge_data = dict(already_edge or {}) - if not existing_edge_data: - existing_edge_data = { - "description": description, - "keywords": GRAPH_FIELD_SEP.join(already_keywords), - "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), - "file_path": GRAPH_FIELD_SEP.join(already_file_paths), - "weight": sum(already_weights) if already_weights else 0.0, - "truncate": "", - "created_at": int(time.time()), - } - existing_edge_data.setdefault("created_at", int(time.time())) - existing_edge_data["src_id"] = src_id - existing_edge_data["tgt_id"] = tgt_id - return existing_edge_data - elif num_fragment > 0: - # Get summary and LLM usage status - description, llm_was_used = await _handle_entity_relation_summary( - "Relation", - f"({src_id}, {tgt_id})", - description_list, - GRAPH_FIELD_SEP, - global_config, - llm_response_cache, - ) - - # Log based on actual LLM usage - if llm_was_used: - status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" - else: - status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" - - # Add truncation info from apply_source_ids_limit if truncation occurred - if len(source_ids) < len(full_source_ids): - # Add + sign if has_placeholder is True, indicating actual file count is higher - full_source_count_str = ( - f"{len(full_source_ids)}+" - if has_placeholder - else str(len(full_source_ids)) - ) - truncation_info = ( - f"{limit_method}:{len(source_ids)}/{full_source_count_str}" - ) - - if dd_message or truncation_info: - status_message += ( - f" ({', '.join(filter(None, [truncation_info, dd_message]))})" - ) - - if already_fragment > 0 or llm_was_used: - logger.info(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) - else: - logger.debug(status_message) - - else: - logger.error(f"Edge {src_id} - {tgt_id} has no description") - description = "(no description)" - - # Split all existing and new keywords into individual terms, then combine and deduplicate - all_keywords = set() - # Process already_keywords (which are comma-separated) - for keyword_str in already_keywords: - if keyword_str: # Skip empty strings - all_keywords.update(k.strip() for k in 
keyword_str.split(",") if k.strip()) - # Process new keywords from edges_data - for edge in edges_data: - if edge.get("keywords"): - all_keywords.update( - k.strip() for k in edge["keywords"].split(",") if k.strip() - ) - # Join all unique keywords with commas - keywords = ",".join(sorted(all_keywords)) - - source_id = GRAPH_FIELD_SEP.join(source_ids) - - # Build file_path with count limit - if skip_summary_due_to_limit: - # Skip limit, keep original file_path - file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) - else: - # Collect and apply limit - file_paths_list = [] - seen_paths = set() - has_placeholder = False # Track if already_file_paths contains placeholder - - # Get placeholder to filter it out file_path_placeholder = global_config.get( "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER ) - # Collect from already_file_paths, excluding placeholder - for fp in already_file_paths: - # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders - has_placeholder = True - continue - if fp and fp not in seen_paths: - file_paths_list.append(fp) - seen_paths.add(fp) + # Add + sign to indicate actual file count is higher + original_count_str = ( + f"{len(file_paths_list)}+" if has_placeholder else str(len(file_paths_list)) + ) - # Collect from new data - for dp in edges_data: - file_path_item = dp.get("file_path") - if file_path_item and file_path_item not in seen_paths: - file_paths_list.append(file_path_item) - seen_paths.add(file_path_item) + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] - # Apply count limit - max_file_paths = global_config.get("max_file_paths") + # Add + sign if has_placeholder is True, indicating actual file count is higher - if len(file_paths_list) > max_file_paths: - limit_method = global_config.get( - "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP - ) - file_path_placeholder = global_config.get( - "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER - ) - original_count = len(file_paths_list) + file_paths_list.append( + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..." + ) + logger.info( + f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})" + ) + # Finalize file_path + file_path = GRAPH_FIELD_SEP.join(file_paths_list) - if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: - # FIFO: keep tail (newest), discard head - file_paths_list = file_paths_list[-max_file_paths:] - else: - # KEEP: keep head (earliest), discard tail - file_paths_list = file_paths_list[:max_file_paths] + # 10. Log based on actual LLM usage + num_fragment = len(description_list) + already_fragment = len(already_description) + if llm_was_used: + status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" + else: + status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" - file_paths_list.append( - f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." 
- ) - logger.info( - f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" - ) + truncation_info = truncation_info_log = "" + if len(source_ids) < len(full_source_ids): + # Add truncation info from apply_source_ids_limit if truncation occurred + truncation_info_log = f"{limit_method} {len(source_ids)}/{len(full_source_ids)}" + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + truncation_info = truncation_info_log + else: + truncation_info = "Keep Old Chunks" - file_path = GRAPH_FIELD_SEP.join(file_paths_list) + deduplicated_num = already_fragment + len(edges_data) - num_fragment + dd_message = "" + if deduplicated_num > 0: + # Duplicated description detected across multiple trucks for the same entity + dd_message = f"dd {deduplicated_num}" + if dd_message or truncation_info_log: + status_message += ( + f" ({', '.join(filter(None, [truncation_info_log, dd_message]))})" + ) + + # Add message to pipeline satus when merge happens + if already_fragment > 0 or llm_was_used: + logger.info(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + else: + logger.debug(status_message) + + # 11. Update both graph and vector db for need_insert_id in [src_id, tgt_id]: if not (await knowledge_graph_inst.has_node(need_insert_id)): node_created_at = int(time.time()) From 80668aae229d07f6404a789925ae8d2897359325 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 15:39:31 +0800 Subject: [PATCH 19/25] Improve file path truncation labels and UI consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Standardize FIFO/KEEP truncation labels • Update UI truncation text format --- lightrag/operate.py | 14 ++++++-------- .../src/components/graph/PropertiesView.tsx | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 60fa66a3..c3816cec 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1650,11 +1650,12 @@ async def _merge_nodes_then_upsert( if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: # FIFO: keep tail (newest), discard head file_paths_list = file_paths_list[-max_file_paths:] + file_paths_list.append(f"...{file_path_placeholder}...(FIFO)") else: # KEEP: keep head (earliest), discard tail file_paths_list = file_paths_list[:max_file_paths] + file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)") - file_paths_list.append(f"...{file_path_placeholder}({limit_method})...") logger.info( f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})" ) @@ -1676,7 +1677,7 @@ async def _merge_nodes_then_upsert( if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: truncation_info = truncation_info_log else: - truncation_info = "Keep Old Chunks" + truncation_info = "KEEP Old" deduplicated_num = already_fragment + len(nodes_data) - num_fragment dd_message = "" @@ -1971,15 +1972,12 @@ async def _merge_edges_then_upsert( if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: # FIFO: keep tail (newest), discard head file_paths_list = file_paths_list[-max_file_paths:] + file_paths_list.append(f"...{file_path_placeholder}...(FIFO)") else: # KEEP: keep head (earliest), discard tail file_paths_list = file_paths_list[:max_file_paths] + file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)") - # Add + sign if has_placeholder is True, 
From 80668aae229d07f6404a789925ae8d2897359325 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 15:39:31 +0800
Subject: [PATCH 19/25] Improve file path truncation labels and UI consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Standardize FIFO/KEEP truncation labels
• Update UI truncation text format
---
 lightrag/operate.py                         | 14 ++++++--------
 .../src/components/graph/PropertiesView.tsx |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 60fa66a3..c3816cec 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1650,11 +1650,12 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
+            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
+            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
 
-        file_paths_list.append(f"...{file_path_placeholder}({limit_method})...")
         logger.info(
             f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
@@ -1676,7 +1677,7 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "Keep Old Chunks"
+            truncation_info = "KEEP Old"
 
     deduplicated_num = already_fragment + len(nodes_data) - num_fragment
     dd_message = ""
@@ -1971,15 +1972,12 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
+            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
+            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
 
-        # Add + sign if has_placeholder is True, indicating actual file count is higher
-
-        file_paths_list.append(
-            f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..."
-        )
         logger.info(
             f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
@@ -2001,7 +1999,7 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "Keep Old Chunks"
+            truncation_info = "KEEP Old"
 
     deduplicated_num = already_fragment + len(edges_data) - num_fragment
     dd_message = ""
diff --git a/lightrag_webui/src/components/graph/PropertiesView.tsx b/lightrag_webui/src/components/graph/PropertiesView.tsx
index 39b9a448..97411f29 100644
--- a/lightrag_webui/src/components/graph/PropertiesView.tsx
+++ b/lightrag_webui/src/components/graph/PropertiesView.tsx
@@ -222,7 +222,7 @@ const PropertyRow = ({
 
   // If this is source_id field and truncate info exists, append it to the tooltip
   if (name === 'source_id' && truncate) {
-    formattedTooltip += `\n(Truncation-${truncate})`
+    formattedTooltip += `\n(Truncated: ${truncate})`
   }
 
   // Use EditablePropertyRow for editable fields (description, entity_id and keywords)

From 3ad616be4f2b4e4f35f2904b6ae3264a322a0800 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 16:12:11 +0800
Subject: [PATCH 20/25] Change default source IDs limit method from KEEP to FIFO

---
 env.example           | 6 ++++--
 lightrag/constants.py | 9 ++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/env.example b/env.example
index 73f2d7b7..4b86d79f 100644
--- a/env.example
+++ b/env.example
@@ -138,8 +138,10 @@ SUMMARY_LANGUAGE=English
 ### control the maximum chunk_ids stored in vector and graph db
 # MAX_SOURCE_IDS_PER_ENTITY=300
 # MAX_SOURCE_IDS_PER_RELATION=300
-### control chunk_ids limitation method: KEEP, FIFO (KEEP: Keep oldest, FIFO: First in first out)
-# SOURCE_IDS_LIMIT_METHOD=KEEP
+### control chunk_ids limitation method: KEEP, FIFO
+### FIFO: First in first out
+### KEEP: Keep oldest (fewer merge actions and faster)
+# SOURCE_IDS_LIMIT_METHOD=FIFO
 
 ### Maximum number of file paths stored in entity/relation file_path field
 # MAX_FILE_PATHS=30
diff --git a/lightrag/constants.py b/lightrag/constants.py
index 7c2b2701..f4e06e11 100644
--- a/lightrag/constants.py
+++ b/lightrag/constants.py
@@ -60,9 +60,12 @@ DEFAULT_RERANK_BINDING = "null"
 # Default source ids limit in meta data for entity and relation
 DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
 DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
-SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"  # Keep oldest
-SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"  # First In First Out (Keep newest)
-DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP
+### control chunk_ids limitation method: KEEP, FIFO
+### FIFO: First in first out
+### KEEP: Keep oldest (fewer merge actions and faster)
+SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
+SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
+DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
 VALID_SOURCE_IDS_LIMIT_METHODS = {
     SOURCE_IDS_LIMIT_METHOD_KEEP,
     SOURCE_IDS_LIMIT_METHOD_FIFO,
 }
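
The two patches above standardize the truncation labels and flip the default limit method to FIFO while keeping KEEP as an opt-in. As a hedged sketch of how such a setting is typically resolved from the environment (the helper name here is an assumption, not LightRAG's actual config loader):

import os

SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {SOURCE_IDS_LIMIT_METHOD_KEEP, SOURCE_IDS_LIMIT_METHOD_FIFO}

def resolve_limit_method() -> str:
    # Fall back to the FIFO default when the env value is missing or invalid.
    method = os.getenv("SOURCE_IDS_LIMIT_METHOD", DEFAULT_SOURCE_IDS_LIMIT_METHOD).upper()
    return method if method in VALID_SOURCE_IDS_LIMIT_METHODS else DEFAULT_SOURCE_IDS_LIMIT_METHOD
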
From 3ed2abd82c1b0335c652d1f46edebcdb04b2bc1d Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 16:20:34 +0800
Subject: [PATCH 21/25] Improve logging to show source ID ratios when skipping entities/edges

---
 lightrag/operate.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index c3816cec..c5f370f3 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1555,7 +1555,9 @@ async def _merge_nodes_then_upsert(
         and not nodes_data
     ):
         if already_node:
-            logger.info(f"Skipped `{entity_name}`: KEEP old chunks")
+            logger.info(
+                f"Skipped `{entity_name}`: KEEP old chunks {len(already_source_ids)}/{len(full_source_ids)}"
+            )
             existing_node_data = dict(already_node)
             return existing_node_data
         else:
@@ -1862,7 +1864,9 @@ async def _merge_edges_then_upsert(
         and not edges_data
     ):
         if already_edge:
-            logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks")
+            logger.info(
+                f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks {len(already_source_ids)}/{len(full_source_ids)}"
+            )
             existing_edge_data = dict(already_edge)
             return existing_edge_data
         else:

From e5e16b7bd171848c5b973a24eda53ae636ffea31 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 16:27:04 +0800
Subject: [PATCH 22/25] Fix Redis data migration error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Use proper Redis connection context
• Fix namespace pattern for key scanning
• Propagate storage check exceptions
• Remove defensive error swallowing
---
 lightrag/kg/redis_impl.py | 11 ++++++-----
 lightrag/lightrag.py      |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py
index 8a393497..2e9a7d43 100644
--- a/lightrag/kg/redis_impl.py
+++ b/lightrag/kg/redis_impl.py
@@ -368,12 +368,13 @@ class RedisKVStorage(BaseKVStorage):
         Returns:
             bool: True if storage is empty, False otherwise
         """
-        pattern = f"{self.namespace}:{self.workspace}:*"
+        pattern = f"{self.final_namespace}:*"
         try:
-            # Use scan to check if any keys exist
-            async for key in self.redis.scan_iter(match=pattern, count=1):
-                return False  # Found at least one key
-            return True  # No keys found
+            async with self._get_redis_connection() as redis:
+                # Use scan to check if any keys exist
+                async for key in redis.scan_iter(match=pattern, count=1):
+                    return False  # Found at least one key
+                return True  # No keys found
         except Exception as e:
             logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
             return True
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 4380a276..afd1de76 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -887,13 +887,13 @@ class LightRAG:
             need_entity_migration = await self.entity_chunks.is_empty()
         except Exception as exc:  # pragma: no cover - defensive logging
             logger.error(f"Failed to check entity chunks storage: {exc}")
-            need_entity_migration = True
+            raise exc
 
         try:
             need_relation_migration = await self.relation_chunks.is_empty()
         except Exception as exc:  # pragma: no cover - defensive logging
             logger.error(f"Failed to check relation chunks storage: {exc}")
-            need_relation_migration = True
+            raise exc
 
         if not need_entity_migration and not need_relation_migration:
             return
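
The Redis fix above scopes the scan to the storage's final namespace and runs it on a managed connection; the emptiness check itself is just "does any key match the prefix". A standalone sketch of that pattern using redis.asyncio, with connection handling simplified:

import redis.asyncio as aioredis

async def namespace_is_empty(client: aioredis.Redis, final_namespace: str) -> bool:
    # True if no key exists under "<final_namespace>:".
    pattern = f"{final_namespace}:*"
    async for _ in client.scan_iter(match=pattern, count=1):
        return False  # found at least one key
    return True
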
From 88a45523e26f799e9e51d149301d7923f14350a5 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 17:33:00 +0800
Subject: [PATCH 23/25] Increase default max file paths from 30 to 100 and improve documentation

- Bump DEFAULT_MAX_FILE_PATHS to 100
- Add clarifying comment about display
---
 env.example           | 5 +++--
 lightrag/constants.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/env.example b/env.example
index 4b86d79f..3fd824ba 100644
--- a/env.example
+++ b/env.example
@@ -142,8 +142,9 @@ SUMMARY_LANGUAGE=English
 ### FIFO: First in first out
 ### KEEP: Keep oldest (fewer merge actions and faster)
 # SOURCE_IDS_LIMIT_METHOD=FIFO
-### Maximum number of file paths stored in entity/relation file_path field
-# MAX_FILE_PATHS=30
+
+# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
+# MAX_FILE_PATHS=100
 
 ### maximum number of related chunks per source entity or relation
 ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
diff --git a/lightrag/constants.py b/lightrag/constants.py
index f4e06e11..0d02edbf 100644
--- a/lightrag/constants.py
+++ b/lightrag/constants.py
@@ -70,8 +70,8 @@ VALID_SOURCE_IDS_LIMIT_METHODS = {
     SOURCE_IDS_LIMIT_METHOD_KEEP,
     SOURCE_IDS_LIMIT_METHOD_FIFO,
 }
-# Default file_path limit in meta data for entity and relation (Use same limit method as source_ids)
-DEFAULT_MAX_FILE_PATHS = 30
+# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
+DEFAULT_MAX_FILE_PATHS = 100
 
 # Field length of file_path in Milvus Schema for entity and relation (Should not be changed)
 # file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata.
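
Since MAX_FILE_PATHS only bounds what is stored for display, the file_path field reduces to joining a capped, order-preserving list. A sketch under the assumption that GRAPH_FIELD_SEP is the usual "<SEP>" separator (in practice, import it from lightrag.constants):

GRAPH_FIELD_SEP = "<SEP>"  # assumption; use lightrag.constants.GRAPH_FIELD_SEP in practice
DEFAULT_MAX_FILE_PATHS = 100

def build_file_path_field(paths, max_paths=DEFAULT_MAX_FILE_PATHS):
    # Drop empties, de-duplicate while preserving order, then cap for display.
    deduped = list(dict.fromkeys(p for p in paths if p))
    return GRAPH_FIELD_SEP.join(deduped[:max_paths])
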
+ f"...{file_path_placeholder}...({limit_method} {max_file_paths}/{original_count})" ) logger.info( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1384,7 +1384,7 @@ async def _rebuild_single_relationship( if len(limited_chunk_ids) < len(normalized_chunk_ids): truncation_info = ( - f"{limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" + f"{limit_method} {len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" ) else: truncation_info = "" From a809245aed58fe68a1c1a75e0d63d7db7f327978 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 18:57:54 +0800 Subject: [PATCH 25/25] Preserve file path order by using lists instead of sets --- lightrag/operate.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index d4a86977..b3adb67d 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1026,7 +1026,7 @@ async def _rebuild_single_entity( async def _update_entity_storage( final_description: str, entity_type: str, - file_paths: set[str], + file_paths: list[str], source_chunk_ids: list[str], truncation_info: str = "", ): @@ -1195,8 +1195,6 @@ async def _rebuild_single_entity( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" ) - file_paths = set(file_paths_list) - # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) entity_types = list(dict.fromkeys(entity_types)) @@ -1231,7 +1229,7 @@ async def _rebuild_single_entity( await _update_entity_storage( final_description, entity_type, - file_paths, + file_paths_list, limited_chunk_ids, truncation_info, ) @@ -1354,8 +1352,6 @@ async def _rebuild_single_relationship( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" ) - file_paths = set(file_paths_list) - # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) keywords = list(dict.fromkeys(keywords)) @@ -1398,8 +1394,8 @@ async def _rebuild_single_relationship( "keywords": combined_keywords, "weight": weight, "source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids), - "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp]) - if file_paths + "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths_list if fp]) + if file_paths_list else current_relationship.get("file_path", "unknown_source"), "truncate": truncation_info, }