From 7871600d8a101504f06566c87bf6bc9125206330 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Tue, 14 Oct 2025 14:47:04 +0500 Subject: [PATCH 1/3] Quick fix to limit source_id ballooning while inserting nodes --- lightrag/constants.py | 1 + lightrag/operate.py | 13 ++++++++++--- lightrag/utils.py | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index 14584559..6fb9feb4 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 +DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/operate.py b/lightrag/operate.py index a12cb63f..cee8f377 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -27,6 +27,7 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, + truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -52,6 +53,7 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -1371,9 +1373,11 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in nodes_data] + already_source_ids) - ) + merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) + + source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_id = GRAPH_FIELD_SEP.join(source_ids) + file_path = build_file_path(already_file_paths, 
nodes_data, entity_name) node_data = dict( @@ -1658,6 +1662,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: + logger.info(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1685,6 +1690,8 @@ async def merge_nodes_and_edges( } } + + logger.info(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index 83a3c394..17ee43a6 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,6 +26,7 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2464,6 +2465,20 @@ async def process_chunks_unified( return final_chunks +def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: + """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" + already_len: int = len(chunk_ids) + + if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + logger.warning( + f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"current size: {already_len} entries." 
+ ) + + truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + + return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From 4e740af79b538653d127708aecfc9a109d276d17 Mon Sep 17 00:00:00 2001 From: haseebuchiha Date: Tue, 14 Oct 2025 16:14:03 +0500 Subject: [PATCH 2/3] Import from env and use default if none and removed useless import --- env.example | 2 ++ lightrag/operate.py | 1 - lightrag/utils.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/env.example b/env.example index 4c8d355d..1d2b81f3 100644 --- a/env.example +++ b/env.example @@ -73,6 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 +### control the maximum chunk_ids stored in vector db +# MAX_CHUNK_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/operate.py b/lightrag/operate.py index cee8f377..0476d169 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -53,7 +53,6 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time diff --git a/lightrag/utils.py b/lightrag/utils.py index 17ee43a6..b33c5a15 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2469,13 +2469,15 @@ def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", 
DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + + if already_len >= max_chunk_ids_per_entity: logger.warning( - f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " f"current size: {already_len} entries." ) - truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) return truncated_chunk_ids From 17c2a929d2c6aa4e5767f21e4b20348be72b3184 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Wed, 15 Oct 2025 18:24:38 +0500 Subject: [PATCH 3/3] Get max source Id config from .env and lightRAG init --- env.example | 4 ++-- lightrag/constants.py | 2 +- lightrag/lightrag.py | 6 ++++++ lightrag/operate.py | 6 +++--- lightrag/utils.py | 21 +++++++++++---------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/env.example b/env.example index 1d2b81f3..e0b649e3 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector db -# MAX_CHUNK_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored +# MAX_SOURCE_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index 6fb9feb4..f7b5c41f 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies 
to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d288685e..2b18f961 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -39,6 +39,7 @@ from lightrag.constants import ( DEFAULT_MAX_ASYNC, DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, + DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, @@ -359,6 +360,11 @@ class LightRAG: ) """Maximum number of graph nodes to return in knowledge graph queries.""" + max_source_ids_per_entity: int = field( + default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + ) + """Maximum number of source (chunk) ids in entity Graph + VDB.""" + addon_params: dict[str, Any] = field( + default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 0476d169..12afffa1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert( merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1690,7 +1690,7 @@ async def merge_nodes_and_edges( } - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation 
wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index b33c5a15..cf585016 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,7 +26,6 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2465,23 +2464,25 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: +def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] + + if already_len <= max_chunk_ids_per_entity: + return chunk_ids + + logger.warning( + f"Source Ids already exceeds {max_chunk_ids_per_entity} for {entity_name}, " + f"current size: {already_len}, truncating..." + ) - if already_len >= max_chunk_ids_per_entity: - logger.warning( - f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len} entries." - ) - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity]) - + return truncated_chunk_ids + + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication