From 7871600d8a101504f06566c87bf6bc9125206330 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Tue, 14 Oct 2025 14:47:04 +0500 Subject: [PATCH 1/3] Quick fix to limit source_id ballooning while inserting nodes --- lightrag/constants.py | 1 + lightrag/operate.py | 13 ++++++++++--- lightrag/utils.py | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index 14584559..6fb9feb4 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 +DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/operate.py b/lightrag/operate.py index a12cb63f..cee8f377 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -27,6 +27,7 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, + truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -52,6 +53,7 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -1371,9 +1373,11 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in nodes_data] + already_source_ids) - ) + merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) + + source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_id = GRAPH_FIELD_SEP.join(source_ids) + file_path = build_file_path(already_file_paths, 
nodes_data, entity_name) node_data = dict( @@ -1658,6 +1662,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: + logger.info(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1685,6 +1690,8 @@ async def merge_nodes_and_edges( } } + + logger.info(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index 83a3c394..17ee43a6 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,6 +26,7 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2464,6 +2465,20 @@ async def process_chunks_unified( return final_chunks +def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: + """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" + already_len: int = len(chunk_ids) + + if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + logger.warning( + f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"current size: {already_len} entries." 
+ ) + + truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + + return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From 4e740af79b538653d127708aecfc9a109d276d17 Mon Sep 17 00:00:00 2001 From: haseebuchiha Date: Tue, 14 Oct 2025 16:14:03 +0500 Subject: [PATCH 2/3] Import from env and use default if none and removed useless import --- env.example | 2 ++ lightrag/operate.py | 1 - lightrag/utils.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/env.example b/env.example index 4c8d355d..1d2b81f3 100644 --- a/env.example +++ b/env.example @@ -73,6 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 +### control the maximum chunk_ids stored in vector db +# MAX_CHUNK_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/operate.py b/lightrag/operate.py index cee8f377..0476d169 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -53,7 +53,6 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time diff --git a/lightrag/utils.py b/lightrag/utils.py index 17ee43a6..b33c5a15 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2469,13 +2469,15 @@ def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", 
DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + + if already_len >= max_chunk_ids_per_entity: logger.warning( - f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " f"current size: {already_len} entries." ) - truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) return truncated_chunk_ids From 17c2a929d2c6aa4e5767f21e4b20348be72b3184 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Wed, 15 Oct 2025 18:24:38 +0500 Subject: [PATCH 3/3] Get max source Id config from .env and lightRAG init --- env.example | 4 ++-- lightrag/constants.py | 2 +- lightrag/lightrag.py | 6 ++++++ lightrag/operate.py | 6 +++--- lightrag/utils.py | 21 +++++++++++---------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/env.example b/env.example index 1d2b81f3..e0b649e3 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector db -# MAX_CHUNK_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored +# MAX_SOURCE_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index 6fb9feb4..f7b5c41f 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies 
to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d288685e..2b18f961 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -39,6 +39,7 @@ from lightrag.constants import ( DEFAULT_MAX_ASYNC, DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, + DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, @@ -359,6 +360,11 @@ class LightRAG: ) """Maximum number of graph nodes to return in knowledge graph queries.""" + max_source_ids_per_entity: int = field( + default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + ) + """Maximum number of source (chunk) ids in entity Graph + VDB.""" + addon_params: dict[str, Any] = field( + default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 0476d169..12afffa1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert( merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1690,7 +1690,7 @@ async def merge_nodes_and_edges( } - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation 
wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index b33c5a15..cf585016 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,7 +26,6 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2465,23 +2464,25 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: +def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] + + if already_len <= max_chunk_ids_per_entity: + return chunk_ids + + logger.warning( + f"Source Ids already exceeds {max_chunk_ids_per_entity} for {entity_name}, " + f"current size: {already_len}, truncating..." + ) - if already_len >= max_chunk_ids_per_entity: - logger.warning( - f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len} entries." - ) - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity]) - + return truncated_chunk_ids + + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication