From 54f0a7d1ca09fb7297ff90d1a9962f2087a7c71a Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Tue, 14 Oct 2025 14:47:04 +0500 Subject: [PATCH 01/25] Quick fix to limit source_id ballooning while inserting nodes --- lightrag/constants.py | 1 + lightrag/operate.py | 15 +++++++++++---- lightrag/utils.py | 15 +++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index 14584559..6fb9feb4 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 +DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/operate.py b/lightrag/operate.py index a12cb63f..29a17e68 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -27,6 +27,7 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, + truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -52,6 +53,7 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -1371,9 +1373,11 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in nodes_data] + already_source_ids) - ) + merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) + + source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_id = GRAPH_FIELD_SEP.join(source_ids) + file_path = build_file_path(already_file_paths, nodes_data, entity_name) node_data = dict( @@ -1658,6 +1662,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: + logger.info(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1673,7 +1678,7 @@ async def merge_nodes_and_edges( if entity_vdb is not None and entity_data: data_for_vdb = { compute_mdhash_id( - entity_data["entity_name"], prefix="ent-" + str(entity_data["entity_name"]), prefix="ent-" ): { "entity_name": entity_data["entity_name"], "entity_type": entity_data["entity_type"], @@ -1685,6 +1690,8 @@ async def merge_nodes_and_edges( } } + + logger.info(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index 83a3c394..17ee43a6 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,6 +26,7 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2464,6 +2465,20 @@ async def process_chunks_unified( return final_chunks +def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: + """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" + already_len: int = len(chunk_ids) + + 
if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + logger.warning( + f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"current size: {already_len} entries." + ) + + truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + + return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From d52c3377b4cb1d3be0ed13e8e7a7119f2c73d348 Mon Sep 17 00:00:00 2001 From: haseebuchiha Date: Tue, 14 Oct 2025 16:14:03 +0500 Subject: [PATCH 02/25] Import from env and use default if none and removed useless import --- env.example | 2 ++ lightrag/operate.py | 1 - lightrag/utils.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/env.example b/env.example index 4c8d355d..1d2b81f3 100644 --- a/env.example +++ b/env.example @@ -73,6 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 +### control the maximum chunk_ids stored in vector db +# MAX_CHUNK_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/operate.py b/lightrag/operate.py index 29a17e68..34a8a613 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -53,7 +53,6 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time diff --git a/lightrag/utils.py b/lightrag/utils.py index 17ee43a6..b33c5a15 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2469,13 +2469,15 @@ def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + + if already_len >= max_chunk_ids_per_entity: logger.warning( - f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " f"current size: {already_len} entries." 
) - truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) return truncated_chunk_ids From c06522b927da81e68898f94c45fa918458d851df Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Wed, 15 Oct 2025 18:24:38 +0500 Subject: [PATCH 03/25] Get max source Id config from .env and lightRAG init --- env.example | 4 ++-- lightrag/constants.py | 2 +- lightrag/lightrag.py | 6 ++++++ lightrag/operate.py | 6 +++--- lightrag/utils.py | 21 +++++++++++---------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/env.example b/env.example index 1d2b81f3..e0b649e3 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector db -# MAX_CHUNK_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored +# MAX_SOURCE_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index 6fb9feb4..f7b5c41f 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d288685e..2b18f961 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -39,6 +39,7 @@ from lightrag.constants import ( DEFAULT_MAX_ASYNC, DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, + DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, @@ -359,6 +360,11 @@ class LightRAG: ) """Maximum number of graph nodes to return in knowledge graph queries.""" + max_source_ids_per_entity: int = field( + default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + ) + """Maximum number of source (chunk) ids in entity Grpah + VDB.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 34a8a613..7a8b6391 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert( merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1690,7 +1690,7 @@ async def 
merge_nodes_and_edges( } - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index b33c5a15..cf585016 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,7 +26,6 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2465,23 +2464,25 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: +def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] + + if already_len <= max_chunk_ids_per_entity: + return chunk_ids + + logger.warning( + f"Source Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " + f"current size: {already_len}, truncating..." + ) - if already_len >= max_chunk_ids_per_entity: - logger.warning( - f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len} entries." - ) - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From 7871600d8a101504f06566c87bf6bc9125206330 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Tue, 14 Oct 2025 14:47:04 +0500 Subject: [PATCH 04/25] Quick fix to limit source_id ballooning while inserting nodes --- lightrag/constants.py | 1 + lightrag/operate.py | 13 ++++++++++--- lightrag/utils.py | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index 14584559..6fb9feb4 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 +DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/operate.py b/lightrag/operate.py index a12cb63f..cee8f377 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -27,6 +27,7 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, + truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -52,6 +53,7 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -1371,9 +1373,11 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in nodes_data] + 
already_source_ids) - ) + merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) + + source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_id = GRAPH_FIELD_SEP.join(source_ids) + file_path = build_file_path(already_file_paths, nodes_data, entity_name) node_data = dict( @@ -1658,6 +1662,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: + logger.info(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1685,6 +1690,8 @@ async def merge_nodes_and_edges( } } + + logger.info(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index 83a3c394..17ee43a6 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,6 +26,7 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2464,6 +2465,20 @@ async def process_chunks_unified( return final_chunks +def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: + """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" + already_len: int = len(chunk_ids) + + if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + logger.warning( + f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"current size: {already_len} entries." + ) + + truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + + return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From 4e740af79b538653d127708aecfc9a109d276d17 Mon Sep 17 00:00:00 2001 From: haseebuchiha Date: Tue, 14 Oct 2025 16:14:03 +0500 Subject: [PATCH 05/25] Import from env and use default if none and removed useless import --- env.example | 2 ++ lightrag/operate.py | 1 - lightrag/utils.py | 8 +++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/env.example b/env.example index 4c8d355d..1d2b81f3 100644 --- a/env.example +++ b/env.example @@ -73,6 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 +### control the maximum chunk_ids stored in vector db +# MAX_CHUNK_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/operate.py b/lightrag/operate.py index cee8f377..0476d169 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -53,7 +53,6 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time diff --git a/lightrag/utils.py b/lightrag/utils.py index 17ee43a6..b33c5a15 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2469,13 +2469,15 @@ def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper 
limits)""" already_len: int = len(chunk_ids) - if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + + if already_len >= max_chunk_ids_per_entity: logger.warning( - f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " f"current size: {already_len} entries." ) - truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) return truncated_chunk_ids From 17c2a929d2c6aa4e5767f21e4b20348be72b3184 Mon Sep 17 00:00:00 2001 From: DivinesLight Date: Wed, 15 Oct 2025 18:24:38 +0500 Subject: [PATCH 06/25] Get max source Id config from .env and lightRAG init --- env.example | 4 ++-- lightrag/constants.py | 2 +- lightrag/lightrag.py | 6 ++++++ lightrag/operate.py | 6 +++--- lightrag/utils.py | 21 +++++++++++---------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/env.example b/env.example index 1d2b81f3..e0b649e3 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector db -# MAX_CHUNK_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored +# MAX_SOURCE_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index 6fb9feb4..f7b5c41f 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d288685e..2b18f961 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -39,6 +39,7 @@ from lightrag.constants import ( DEFAULT_MAX_ASYNC, DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, + DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, @@ -359,6 +360,11 @@ class LightRAG: ) """Maximum number of graph nodes to return in knowledge graph queries.""" + max_source_ids_per_entity: int = field( + default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + ) + """Maximum number of source (chunk) ids in entity Grpah + VDB.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 0476d169..12afffa1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert( merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - source_ids = truncate_entity_source_id(merged_source_ids, entity_name) + source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = 
GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Graph database operation (critical path, must succeed) entity_data = await _merge_nodes_then_upsert( entity_name, @@ -1690,7 +1690,7 @@ async def merge_nodes_and_edges( } - logger.info(f"Inserting {entity_name} in Graph") + logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( operation=lambda: entity_vdb.upsert(data_for_vdb), diff --git a/lightrag/utils.py b/lightrag/utils.py index b33c5a15..cf585016 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -26,7 +26,6 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) # Initialize logger with basic configuration @@ -2465,23 +2464,25 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: +def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] + + if already_len <= max_chunk_ids_per_entity: + return chunk_ids + + logger.warning( + f"Source Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " + f"current size: {already_len}, truncating..." + ) - if already_len >= max_chunk_ids_per_entity: - logger.warning( - f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len} entries." 
- ) - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From dc62c78f981f67fa9f7e895b7b401035c788271f Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 15:24:15 +0800 Subject: [PATCH 07/25] Add entity/relation chunk tracking with configurable source ID limits - Add entity_chunks & relation_chunks storage - Implement KEEP/FIFO limit strategies - Update env.example with new settings - Add migration for chunk tracking data - Support all KV storage --- env.example | 7 +- lightrag/api/routers/document_routes.py | 2 + lightrag/base.py | 8 + lightrag/constants.py | 11 +- lightrag/kg/json_doc_status_impl.py | 14 + lightrag/kg/json_kv_impl.py | 29 +- lightrag/kg/mongo_impl.py | 44 ++- lightrag/kg/postgres_impl.py | 303 +++++++++++------ lightrag/kg/redis_impl.py | 80 ++--- lightrag/lightrag.py | 338 +++++++++++++++++-- lightrag/namespace.py | 2 + lightrag/operate.py | 423 ++++++++++++++++++++---- lightrag/utils.py | 127 ++++++- 13 files changed, 1098 insertions(+), 290 deletions(-) diff --git a/env.example b/env.example index b08f1758..6d53c390 100644 --- a/env.example +++ b/env.example @@ -73,8 +73,11 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored -# MAX_SOURCE_IDS_PER_ENTITY=500 +### control the maximum chunk_ids stored in vector and graph db +# MAX_SOURCE_IDS_PER_ENTITY=300 +# MAX_SOURCE_IDS_PER_RELATION=300 +### control chunk_ids limitation method: KEEP, FIFO (KEPP: Ingore New Chunks, FIFO: New chunks replace old chunks) +# SOURCE_IDS_LIMIT_METHOD=KEEP ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 7dc2c34c..0ed5a711 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -2003,6 +2003,8 @@ def create_document_routes( rag.full_docs, rag.full_entities, rag.full_relations, + rag.entity_chunks, + rag.relation_chunks, rag.entities_vdb, rag.relationships_vdb, rag.chunks_vdb, diff --git a/lightrag/base.py b/lightrag/base.py index 45c5cb2c..e569de2a 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -355,6 +355,14 @@ class BaseKVStorage(StorageNameSpace, ABC): None """ + @abstractmethod + async def is_empty(self) -> bool: + """Check if the storage is empty + + Returns: + bool: True if storage contains no data, False otherwise + """ + @dataclass class BaseGraphStorage(StorageNameSpace, ABC): diff --git a/lightrag/constants.py b/lightrag/constants.py index f7b5c41f..e374a991 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -13,7 +13,16 @@ DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs + +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 +DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 +SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" +SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" +DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP +VALID_SOURCE_IDS_LIMIT_METHODS = { + 
SOURCE_IDS_LIMIT_METHOD_KEEP, + SOURCE_IDS_LIMIT_METHOD_FIFO, +} # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index e6d101a7..014499f2 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -187,6 +187,20 @@ class JsonDocStatusStorage(DocStatusStorage): await self.index_done_callback() + async def is_empty(self) -> bool: + """Check if the storage is empty + + Returns: + bool: True if storage is empty, False otherwise + + Raises: + StorageNotInitializedError: If storage is not initialized + """ + if self._storage_lock is None: + raise StorageNotInitializedError("JsonDocStatusStorage") + async with self._storage_lock: + return len(self._data) == 0 + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: async with self._storage_lock: return self._data.get(id) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 553ba417..fd016b14 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -84,26 +84,6 @@ class JsonKVStorage(BaseKVStorage): write_json(data_dict, self._file_name) await clear_all_update_flags(self.final_namespace) - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - async with self._storage_lock: - result = {} - for key, value in self._data.items(): - if value: - # Create a copy to avoid modifying the original data - data = dict(value) - # Ensure time fields are present, provide default values for old data - data.setdefault("create_time", 0) - data.setdefault("update_time", 0) - result[key] = data - else: - result[key] = value - return result - async def get_by_id(self, id: str) -> dict[str, Any] | None: async with self._storage_lock: result = self._data.get(id) @@ -200,6 +180,15 @@ class JsonKVStorage(BaseKVStorage): if any_deleted: await set_all_update_flags(self.final_namespace) + async def is_empty(self) -> bool: + """Check if the storage is empty + + Returns: + bool: True if storage contains no data, False otherwise + """ + async with self._storage_lock: + return len(self._data) == 0 + async def drop(self) -> dict[str, str]: """Drop all data from storage and clean up resources This action will persistent the data to disk immediately. 
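The is_empty() methods added across the KV backends in this patch all follow the contract declared in base.py above: return True when the namespace holds no data for the current workspace. The Mongo, PostgreSQL and Redis implementations also return True when the check itself fails, and the migration code in lightrag.py falls back to the same assumption, so a transient storage error leads to re-seeding rather than silently skipping the chunk-tracking backfill. A minimal sketch of how a caller might gate the backfill on that contract (the helper below is illustrative and not part of the patch; only BaseKVStorage.is_empty() is assumed):

from lightrag.base import BaseKVStorage

async def needs_chunk_backfill(
    entity_chunks: BaseKVStorage, relation_chunks: BaseKVStorage
) -> tuple[bool, bool]:
    # Mirrors the defensive behaviour of _migrate_chunk_tracking_storage:
    # if the emptiness check itself fails, assume the store needs seeding.
    async def _safe_is_empty(storage: BaseKVStorage) -> bool:
        try:
            return await storage.is_empty()
        except Exception:
            return True

    return await _safe_is_empty(entity_chunks), await _safe_is_empty(relation_chunks)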
diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index a62c3031..e55062f1 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -175,22 +175,6 @@ class MongoKVStorage(BaseKVStorage): existing_ids = {str(x["_id"]) async for x in cursor} return keys - existing_ids - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - cursor = self._data.find({}) - result = {} - async for doc in cursor: - doc_id = doc.pop("_id") - # Ensure time fields are present for all documents - doc.setdefault("create_time", 0) - doc.setdefault("update_time", 0) - result[doc_id] = doc - return result - async def upsert(self, data: dict[str, dict[str, Any]]) -> None: logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}") if not data: @@ -236,6 +220,20 @@ class MongoKVStorage(BaseKVStorage): # Mongo handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + try: + # Use count_documents with limit 1 for efficiency + count = await self._data.count_documents({}, limit=1) + return count == 0 + except PyMongoError as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: """Delete documents with specified IDs @@ -466,6 +464,20 @@ class MongoDocStatusStorage(DocStatusStorage): # Mongo handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + try: + # Use count_documents with limit 1 for efficiency + count = await self._data.count_documents({}, limit=1) + return count == 0 + except PyMongoError as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection. 
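The limit strategies configured above (SOURCE_IDS_LIMIT_METHOD with the KEEP and FIFO constants from constants.py) are enforced by apply_source_ids_limit, which operate.py imports from lightrag.utils later in this patch; its implementation is not shown in this excerpt. A sketch consistent with the documented semantics, where KEEP ignores chunk ids that arrive after the cap is reached and FIFO lets new ids displace the oldest, might look like this (illustrative only, not the library's implementation):

from lightrag.constants import (
    SOURCE_IDS_LIMIT_METHOD_FIFO,
    SOURCE_IDS_LIMIT_METHOD_KEEP,
)

def limit_source_ids(
    chunk_ids: list[str],
    limit: int,
    method: str = SOURCE_IDS_LIMIT_METHOD_KEEP,
) -> list[str]:
    # Illustrative cap on an ordered list of chunk ids (oldest first).
    if limit <= 0 or len(chunk_ids) <= limit:
        return list(chunk_ids)
    if method == SOURCE_IDS_LIMIT_METHOD_FIFO:
        # FIFO: new chunks replace old chunks, so keep the most recent ids.
        return chunk_ids[-limit:]
    # KEEP (default): ignore new chunks once the cap is reached.
    return chunk_ids[:limit]

Under the in-code defaults introduced here (DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 with KEEP), an entity that already tracks three chunk ids would ignore ids from newly inserted documents, while FIFO would rotate the oldest id out instead.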
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 54bdf0f6..3899fa20 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -1656,113 +1656,6 @@ class PGKVStorage(BaseKVStorage): self.db = None ################ QUERY METHODS ################ - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - table_name = namespace_to_table_name(self.namespace) - if not table_name: - logger.error( - f"[{self.workspace}] Unknown namespace for get_all: {self.namespace}" - ) - return {} - - sql = f"SELECT * FROM {table_name} WHERE workspace=$1" - params = {"workspace": self.workspace} - - try: - results = await self.db.query(sql, list(params.values()), multirows=True) - - # Special handling for LLM cache to ensure compatibility with _get_cached_extraction_results - if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): - processed_results = {} - for row in results: - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - # Map field names and add cache_type for compatibility - processed_row = { - **row, - "return": row.get("return_value", ""), - "cache_type": row.get("original_prompt", "unknow"), - "original_prompt": row.get("original_prompt", ""), - "chunk_id": row.get("chunk_id"), - "mode": row.get("mode", "default"), - "create_time": create_time, - "update_time": create_time if update_time == 0 else update_time, - } - processed_results[row["id"]] = processed_row - return processed_results - - # For text_chunks namespace, parse llm_cache_list JSON string back to list - if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): - processed_results = {} - for row in results: - llm_cache_list = row.get("llm_cache_list", []) - if isinstance(llm_cache_list, str): - try: - llm_cache_list = json.loads(llm_cache_list) - except json.JSONDecodeError: - llm_cache_list = [] - row["llm_cache_list"] = llm_cache_list - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - row["create_time"] = create_time - row["update_time"] = ( - create_time if update_time == 0 else update_time - ) - processed_results[row["id"]] = row - return processed_results - - # For FULL_ENTITIES namespace, parse entity_names JSON string back to list - if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_ENTITIES): - processed_results = {} - for row in results: - entity_names = row.get("entity_names", []) - if isinstance(entity_names, str): - try: - entity_names = json.loads(entity_names) - except json.JSONDecodeError: - entity_names = [] - row["entity_names"] = entity_names - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - row["create_time"] = create_time - row["update_time"] = ( - create_time if update_time == 0 else update_time - ) - processed_results[row["id"]] = row - return processed_results - - # For FULL_RELATIONS namespace, parse relation_pairs JSON string back to list - if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_RELATIONS): - processed_results = {} - for row in results: - relation_pairs = row.get("relation_pairs", []) - if isinstance(relation_pairs, str): - try: - relation_pairs = json.loads(relation_pairs) - except json.JSONDecodeError: - relation_pairs = [] - row["relation_pairs"] = relation_pairs - create_time = row.get("create_time", 0) - update_time = row.get("update_time", 0) - row["create_time"] = create_time - row["update_time"] = ( - create_time if update_time == 
0 else update_time - ) - processed_results[row["id"]] = row - return processed_results - - # For other namespaces, return as-is - return {row["id"]: row for row in results} - except Exception as e: - logger.error( - f"[{self.workspace}] Error retrieving all data from {self.namespace}: {e}" - ) - return {} - async def get_by_id(self, id: str) -> dict[str, Any] | None: """Get data by id.""" sql = SQL_TEMPLATES["get_by_id_" + self.namespace] @@ -1838,6 +1731,38 @@ class PGKVStorage(BaseKVStorage): response["create_time"] = create_time response["update_time"] = create_time if update_time == 0 else update_time + # Special handling for ENTITY_CHUNKS namespace + if response and is_namespace(self.namespace, NameSpace.KV_STORE_ENTITY_CHUNKS): + # Parse chunk_ids JSON string back to list + chunk_ids = response.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + response["chunk_ids"] = chunk_ids + create_time = response.get("create_time", 0) + update_time = response.get("update_time", 0) + response["create_time"] = create_time + response["update_time"] = create_time if update_time == 0 else update_time + + # Special handling for RELATION_CHUNKS namespace + if response and is_namespace( + self.namespace, NameSpace.KV_STORE_RELATION_CHUNKS + ): + # Parse chunk_ids JSON string back to list + chunk_ids = response.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + response["chunk_ids"] = chunk_ids + create_time = response.get("create_time", 0) + update_time = response.get("update_time", 0) + response["create_time"] = create_time + response["update_time"] = create_time if update_time == 0 else update_time + return response if response else None # Query by id @@ -1946,6 +1871,38 @@ class PGKVStorage(BaseKVStorage): result["create_time"] = create_time result["update_time"] = create_time if update_time == 0 else update_time + # Special handling for ENTITY_CHUNKS namespace + if results and is_namespace(self.namespace, NameSpace.KV_STORE_ENTITY_CHUNKS): + for result in results: + # Parse chunk_ids JSON string back to list + chunk_ids = result.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + result["chunk_ids"] = chunk_ids + create_time = result.get("create_time", 0) + update_time = result.get("update_time", 0) + result["create_time"] = create_time + result["update_time"] = create_time if update_time == 0 else update_time + + # Special handling for RELATION_CHUNKS namespace + if results and is_namespace(self.namespace, NameSpace.KV_STORE_RELATION_CHUNKS): + for result in results: + # Parse chunk_ids JSON string back to list + chunk_ids = result.get("chunk_ids", []) + if isinstance(chunk_ids, str): + try: + chunk_ids = json.loads(chunk_ids) + except json.JSONDecodeError: + chunk_ids = [] + result["chunk_ids"] = chunk_ids + create_time = result.get("create_time", 0) + update_time = result.get("update_time", 0) + result["create_time"] = create_time + result["update_time"] = create_time if update_time == 0 else update_time + return _order_results(results) async def filter_keys(self, keys: set[str]) -> set[str]: @@ -2050,11 +2007,61 @@ class PGKVStorage(BaseKVStorage): "update_time": current_time, } await self.db.execute(upsert_sql, _data) + elif is_namespace(self.namespace, NameSpace.KV_STORE_ENTITY_CHUNKS): + # Get current UTC time 
and convert to naive datetime for database storage + current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None) + for k, v in data.items(): + upsert_sql = SQL_TEMPLATES["upsert_entity_chunks"] + _data = { + "workspace": self.workspace, + "id": k, + "chunk_ids": json.dumps(v["chunk_ids"]), + "count": v["count"], + "create_time": current_time, + "update_time": current_time, + } + await self.db.execute(upsert_sql, _data) + elif is_namespace(self.namespace, NameSpace.KV_STORE_RELATION_CHUNKS): + # Get current UTC time and convert to naive datetime for database storage + current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None) + for k, v in data.items(): + upsert_sql = SQL_TEMPLATES["upsert_relation_chunks"] + _data = { + "workspace": self.workspace, + "id": k, + "chunk_ids": json.dumps(v["chunk_ids"]), + "count": v["count"], + "create_time": current_time, + "update_time": current_time, + } + await self.db.execute(upsert_sql, _data) async def index_done_callback(self) -> None: # PG handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + table_name = namespace_to_table_name(self.namespace) + if not table_name: + logger.error( + f"[{self.workspace}] Unknown namespace for is_empty check: {self.namespace}" + ) + return True + + sql = f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE workspace=$1 LIMIT 1) as has_data" + + try: + result = await self.db.query(sql, [self.workspace]) + return not result.get("has_data", False) if result else True + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs @@ -2970,6 +2977,28 @@ class PGDocStatusStorage(DocStatusStorage): # PG handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + table_name = namespace_to_table_name(self.namespace) + if not table_name: + logger.error( + f"[{self.workspace}] Unknown namespace for is_empty check: {self.namespace}" + ) + return True + + sql = f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE workspace=$1 LIMIT 1) as has_data" + + try: + result = await self.db.query(sql, [self.workspace]) + return not result.get("has_data", False) if result else True + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs @@ -4721,6 +4750,8 @@ NAMESPACE_TABLE_MAP = { NameSpace.KV_STORE_TEXT_CHUNKS: "LIGHTRAG_DOC_CHUNKS", NameSpace.KV_STORE_FULL_ENTITIES: "LIGHTRAG_FULL_ENTITIES", NameSpace.KV_STORE_FULL_RELATIONS: "LIGHTRAG_FULL_RELATIONS", + NameSpace.KV_STORE_ENTITY_CHUNKS: "LIGHTRAG_ENTITY_CHUNKS", + NameSpace.KV_STORE_RELATION_CHUNKS: "LIGHTRAG_RELATION_CHUNKS", NameSpace.KV_STORE_LLM_RESPONSE_CACHE: "LIGHTRAG_LLM_CACHE", NameSpace.VECTOR_STORE_CHUNKS: "LIGHTRAG_VDB_CHUNKS", NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY", @@ -4861,6 +4892,28 @@ TABLES = { CONSTRAINT LIGHTRAG_FULL_RELATIONS_PK PRIMARY KEY (workspace, id) )""" }, + "LIGHTRAG_ENTITY_CHUNKS": { + "ddl": """CREATE TABLE LIGHTRAG_ENTITY_CHUNKS ( + id VARCHAR(512), + workspace VARCHAR(255), + chunk_ids 
JSONB, + count INTEGER, + create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT LIGHTRAG_ENTITY_CHUNKS_PK PRIMARY KEY (workspace, id) + )""" + }, + "LIGHTRAG_RELATION_CHUNKS": { + "ddl": """CREATE TABLE LIGHTRAG_RELATION_CHUNKS ( + id VARCHAR(512), + workspace VARCHAR(255), + chunk_ids JSONB, + count INTEGER, + create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id) + )""" + }, } @@ -4918,6 +4971,26 @@ SQL_TEMPLATES = { EXTRACT(EPOCH FROM update_time)::BIGINT as update_time FROM LIGHTRAG_FULL_RELATIONS WHERE workspace=$1 AND id = ANY($2) """, + "get_by_id_entity_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_ENTITY_CHUNKS WHERE workspace=$1 AND id=$2 + """, + "get_by_id_relation_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_RELATION_CHUNKS WHERE workspace=$1 AND id=$2 + """, + "get_by_ids_entity_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_ENTITY_CHUNKS WHERE workspace=$1 AND id = ANY($2) + """, + "get_by_ids_relation_chunks": """SELECT id, chunk_ids, count, + EXTRACT(EPOCH FROM create_time)::BIGINT as create_time, + EXTRACT(EPOCH FROM update_time)::BIGINT as update_time + FROM LIGHTRAG_RELATION_CHUNKS WHERE workspace=$1 AND id = ANY($2) + """, "filter_keys": "SELECT id FROM {table_name} WHERE workspace=$1 AND id IN ({ids})", "upsert_doc_full": """INSERT INTO LIGHTRAG_DOC_FULL (id, content, doc_name, workspace) VALUES ($1, $2, $3, $4) @@ -4965,6 +5038,22 @@ SQL_TEMPLATES = { count=EXCLUDED.count, update_time = EXCLUDED.update_time """, + "upsert_entity_chunks": """INSERT INTO LIGHTRAG_ENTITY_CHUNKS (workspace, id, chunk_ids, count, + create_time, update_time) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (workspace,id) DO UPDATE + SET chunk_ids=EXCLUDED.chunk_ids, + count=EXCLUDED.count, + update_time = EXCLUDED.update_time + """, + "upsert_relation_chunks": """INSERT INTO LIGHTRAG_RELATION_CHUNKS (workspace, id, chunk_ids, count, + create_time, update_time) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (workspace,id) DO UPDATE + SET chunk_ids=EXCLUDED.chunk_ids, + count=EXCLUDED.count, + update_time = EXCLUDED.update_time + """, # SQL for VectorStorage "upsert_chunk": """INSERT INTO LIGHTRAG_VDB_CHUNKS (workspace, id, tokens, chunk_order_index, full_doc_id, content, content_vector, file_path, diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 56569dda..8a393497 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -304,51 +304,6 @@ class RedisKVStorage(BaseKVStorage): logger.error(f"[{self.workspace}] JSON decode error in batch get: {e}") return [None] * len(ids) - async def get_all(self) -> dict[str, Any]: - """Get all data from storage - - Returns: - Dictionary containing all stored data - """ - async with self._get_redis_connection() as redis: - try: - # Get all keys for this namespace - keys = await redis.keys(f"{self.final_namespace}:*") - - if not keys: - return {} - - # Get all values in batch - pipe = redis.pipeline() - for key in keys: - pipe.get(key) - values = await pipe.execute() 
- - # Build result dictionary - result = {} - for key, value in zip(keys, values): - if value: - # Extract the ID part (after namespace:) - key_id = key.split(":", 1)[1] - try: - data = json.loads(value) - # Ensure time fields are present for all documents - data.setdefault("create_time", 0) - data.setdefault("update_time", 0) - result[key_id] = data - except json.JSONDecodeError as e: - logger.error( - f"[{self.workspace}] JSON decode error for key {key}: {e}" - ) - continue - - return result - except Exception as e: - logger.error( - f"[{self.workspace}] Error getting all data from Redis: {e}" - ) - return {} - async def filter_keys(self, keys: set[str]) -> set[str]: async with self._get_redis_connection() as redis: pipe = redis.pipeline() @@ -407,8 +362,24 @@ class RedisKVStorage(BaseKVStorage): # Redis handles persistence automatically pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + pattern = f"{self.namespace}:{self.workspace}:*" + try: + # Use scan to check if any keys exist + async for key in self.redis.scan_iter(match=pattern, count=1): + return False # Found at least one key + return True # No keys found + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + async def delete(self, ids: list[str]) -> None: - """Delete entries with specified IDs""" + """Delete specific records from storage by their IDs""" if not ids: return @@ -868,6 +839,23 @@ class RedisDocStatusStorage(DocStatusStorage): """Redis handles persistence automatically""" pass + async def is_empty(self) -> bool: + """Check if the storage is empty for the current workspace and namespace + + Returns: + bool: True if storage is empty, False otherwise + """ + pattern = f"{self.final_namespace}:*" + try: + async with self._get_redis_connection() as redis: + # Use scan to check if any keys exist + async for key in redis.scan_iter(match=pattern, count=1): + return False # Found at least one key + return True # No keys found + except Exception as e: + logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}") + return True + @redis_retry async def upsert(self, data: dict[str, dict[str, Any]]) -> None: """Insert or update document status data""" diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 056b5bca..1f32da50 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -41,10 +41,12 @@ from lightrag.constants import ( DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, + DEFAULT_MAX_SOURCE_IDS_PER_RELATION, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, DEFAULT_LLM_TIMEOUT, DEFAULT_EMBEDDING_TIMEOUT, + DEFAULT_SOURCE_IDS_LIMIT_METHOD, ) from lightrag.utils import get_env_value @@ -99,6 +101,9 @@ from lightrag.utils import ( generate_track_id, convert_to_user_format, logger, + subtract_source_ids, + make_relation_chunk_key, + normalize_source_ids_limit_method, ) from lightrag.types import KnowledgeGraph from dotenv import load_dotenv @@ -362,10 +367,32 @@ class LightRAG: """Maximum number of graph nodes to return in knowledge graph queries.""" max_source_ids_per_entity: int = field( - default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int) + default=get_env_value( + "MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int + ) ) """Maximum number of source (chunk) ids in entity Grpah + VDB.""" + 
max_source_ids_per_relation: int = field( + default=get_env_value( + "MAX_SOURCE_IDS_PER_RELATION", + DEFAULT_MAX_SOURCE_IDS_PER_RELATION, + int, + ) + ) + """Maximum number of source (chunk) ids in relation Graph + VDB.""" + + source_ids_limit_method: str = field( + default_factory=lambda: normalize_source_ids_limit_method( + get_env_value( + "SOURCE_IDS_LIMIT_METHOD", + DEFAULT_SOURCE_IDS_LIMIT_METHOD, + str, + ) + ) + ) + """Strategy for enforcing source_id limits: IGNORE_NEW or FIFO.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( @@ -535,6 +562,18 @@ class LightRAG: embedding_func=self.embedding_func, ) + self.entity_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore + namespace=NameSpace.KV_STORE_ENTITY_CHUNKS, + workspace=self.workspace, + embedding_func=self.embedding_func, + ) + + self.relation_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore + namespace=NameSpace.KV_STORE_RELATION_CHUNKS, + workspace=self.workspace, + embedding_func=self.embedding_func, + ) + self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore namespace=NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION, workspace=self.workspace, @@ -594,6 +633,8 @@ class LightRAG: self.text_chunks, self.full_entities, self.full_relations, + self.entity_chunks, + self.relation_chunks, self.entities_vdb, self.relationships_vdb, self.chunks_vdb, @@ -616,6 +657,8 @@ class LightRAG: ("text_chunks", self.text_chunks), ("full_entities", self.full_entities), ("full_relations", self.full_relations), + ("entity_chunks", self.entity_chunks), + ("relation_chunks", self.relation_chunks), ("entities_vdb", self.entities_vdb), ("relationships_vdb", self.relationships_vdb), ("chunks_vdb", self.chunks_vdb), @@ -671,6 +714,13 @@ class LightRAG: logger.debug("No entities found in graph, skipping migration check") return + try: + # Initialize chunk tracking storage after migration + await self._migrate_chunk_tracking_storage() + except Exception as e: + logger.error(f"Error during chunk_tracking migration: {e}") + raise e + # Check if full_entities and full_relations are empty # Get all processed documents to check their entity/relation data try: @@ -711,11 +761,11 @@ class LightRAG: except Exception as e: logger.error(f"Error during migration check: {e}") - # Don't raise the error, just log it to avoid breaking initialization + raise e except Exception as e: logger.error(f"Error in data migration check: {e}") - # Don't raise the error to avoid breaking initialization + raise e async def _migrate_entity_relation_data(self, processed_docs: dict): """Migrate existing entity and relation data to full_entities and full_relations storage""" @@ -814,6 +864,140 @@ class LightRAG: f"Data migration completed: migrated {migration_count} documents with entities/relations" ) + async def _migrate_chunk_tracking_storage(self) -> None: + """Ensure entity/relation chunk tracking KV stores exist and are seeded.""" + + if not self.entity_chunks or not self.relation_chunks: + return + + need_entity_migration = False + need_relation_migration = False + + try: + need_entity_migration = await self.entity_chunks.is_empty() + except Exception as exc: # pragma: no cover - defensive logging + logger.error(f"Failed to check entity chunks storage: {exc}") + need_entity_migration = True + + try: + need_relation_migration = await self.relation_chunks.is_empty() + except Exception as exc: # pragma: no cover - defensive logging + 
logger.error(f"Failed to check relation chunks storage: {exc}") + need_relation_migration = True + + if not need_entity_migration and not need_relation_migration: + return + + BATCH_SIZE = 500 # Process 500 records per batch + + if need_entity_migration: + try: + nodes = await self.chunk_entity_relation_graph.get_all_nodes() + except Exception as exc: + logger.error(f"Failed to fetch nodes for chunk migration: {exc}") + nodes = [] + + logger.info(f"Starting chunk_tracking data migration: {len(nodes)} nodes") + + # Process nodes in batches + total_nodes = len(nodes) + total_batches = (total_nodes + BATCH_SIZE - 1) // BATCH_SIZE + total_migrated = 0 + + for batch_idx in range(total_batches): + start_idx = batch_idx * BATCH_SIZE + end_idx = min((batch_idx + 1) * BATCH_SIZE, total_nodes) + batch_nodes = nodes[start_idx:end_idx] + + upsert_payload: dict[str, dict[str, object]] = {} + for node in batch_nodes: + entity_id = node.get("entity_id") or node.get("id") + if not entity_id: + continue + + raw_source = node.get("source_id") or "" + chunk_ids = [ + chunk_id + for chunk_id in raw_source.split(GRAPH_FIELD_SEP) + if chunk_id + ] + if not chunk_ids: + continue + + upsert_payload[entity_id] = { + "chunk_ids": chunk_ids, + "count": len(chunk_ids), + } + + if upsert_payload: + await self.entity_chunks.upsert(upsert_payload) + total_migrated += len(upsert_payload) + logger.info( + f"Processed entity batch {batch_idx + 1}/{total_batches}: {len(upsert_payload)} records (total: {total_migrated}/{total_nodes})" + ) + + if total_migrated > 0: + # Persist entity_chunks data to disk + await self.entity_chunks.index_done_callback() + logger.info( + f"Entity chunk_tracking migration completed: {total_migrated} records persisted" + ) + + if need_relation_migration: + try: + edges = await self.chunk_entity_relation_graph.get_all_edges() + except Exception as exc: + logger.error(f"Failed to fetch edges for chunk migration: {exc}") + edges = [] + + logger.info(f"Starting chunk_tracking data migration: {len(edges)} edges") + + # Process edges in batches + total_edges = len(edges) + total_batches = (total_edges + BATCH_SIZE - 1) // BATCH_SIZE + total_migrated = 0 + + for batch_idx in range(total_batches): + start_idx = batch_idx * BATCH_SIZE + end_idx = min((batch_idx + 1) * BATCH_SIZE, total_edges) + batch_edges = edges[start_idx:end_idx] + + upsert_payload: dict[str, dict[str, object]] = {} + for edge in batch_edges: + src = edge.get("source") or edge.get("src_id") or edge.get("src") + tgt = edge.get("target") or edge.get("tgt_id") or edge.get("tgt") + if not src or not tgt: + continue + + raw_source = edge.get("source_id") or "" + chunk_ids = [ + chunk_id + for chunk_id in raw_source.split(GRAPH_FIELD_SEP) + if chunk_id + ] + if not chunk_ids: + continue + + storage_key = make_relation_chunk_key(src, tgt) + upsert_payload[storage_key] = { + "chunk_ids": chunk_ids, + "count": len(chunk_ids), + } + + if upsert_payload: + await self.relation_chunks.upsert(upsert_payload) + total_migrated += len(upsert_payload) + logger.info( + f"Processed relation batch {batch_idx + 1}/{total_batches}: {len(upsert_payload)} records (total: {total_migrated}/{total_edges})" + ) + + if total_migrated > 0: + # Persist relation_chunks data to disk + await self.relation_chunks.index_done_callback() + logger.info( + f"Relation chunk_tracking migration completed: {total_migrated} records persisted" + ) + async def get_graph_labels(self): text = await self.chunk_entity_relation_graph.get_all_labels() return text @@ -1676,6 +1860,8 @@ 
class LightRAG: pipeline_status=pipeline_status, pipeline_status_lock=pipeline_status_lock, llm_response_cache=self.llm_response_cache, + entity_chunks_storage=self.entity_chunks, + relation_chunks_storage=self.relation_chunks, current_file_number=current_file_number, total_files=total_files, file_path=file_path, @@ -1845,6 +2031,8 @@ class LightRAG: self.text_chunks, self.full_entities, self.full_relations, + self.entity_chunks, + self.relation_chunks, self.llm_response_cache, self.entities_vdb, self.relationships_vdb, @@ -2718,9 +2906,11 @@ class LightRAG: # 4. Analyze entities and relationships that will be affected entities_to_delete = set() - entities_to_rebuild = {} # entity_name -> remaining_chunk_ids + entities_to_rebuild = {} # entity_name -> remaining chunk id list relationships_to_delete = set() - relationships_to_rebuild = {} # (src, tgt) -> remaining_chunk_ids + relationships_to_rebuild = {} # (src, tgt) -> remaining chunk id list + entity_chunk_updates: dict[str, list[str]] = {} + relation_chunk_updates: dict[tuple[str, str], list[str]] = {} try: # Get affected entities and relations from full_entities and full_relations storage @@ -2776,14 +2966,41 @@ class LightRAG: # Process entities for node_data in affected_nodes: node_label = node_data.get("entity_id") - if node_label and "source_id" in node_data: - sources = set(node_data["source_id"].split(GRAPH_FIELD_SEP)) - remaining_sources = sources - chunk_ids + if not node_label: + continue - if not remaining_sources: - entities_to_delete.add(node_label) - elif remaining_sources != sources: - entities_to_rebuild[node_label] = remaining_sources + existing_sources: list[str] = [] + if self.entity_chunks: + stored_chunks = await self.entity_chunks.get_by_id(node_label) + if stored_chunks and isinstance(stored_chunks, dict): + existing_sources = [ + chunk_id + for chunk_id in stored_chunks.get("chunk_ids", []) + if chunk_id + ] + + if not existing_sources and node_data.get("source_id"): + existing_sources = [ + chunk_id + for chunk_id in node_data["source_id"].split( + GRAPH_FIELD_SEP + ) + if chunk_id + ] + + if not existing_sources: + continue + + remaining_sources = subtract_source_ids(existing_sources, chunk_ids) + + if not remaining_sources: + entities_to_delete.add(node_label) + entity_chunk_updates[node_label] = [] + elif remaining_sources != existing_sources: + entities_to_rebuild[node_label] = remaining_sources + entity_chunk_updates[node_label] = remaining_sources + else: + logger.info(f"Untouch entity: {node_label}") async with pipeline_status_lock: log_message = f"Found {len(entities_to_rebuild)} affected entities" @@ -2796,21 +3013,51 @@ class LightRAG: src = edge_data.get("source") tgt = edge_data.get("target") - if src and tgt and "source_id" in edge_data: - edge_tuple = tuple(sorted((src, tgt))) - if ( - edge_tuple in relationships_to_delete - or edge_tuple in relationships_to_rebuild - ): - continue + if not src or not tgt or "source_id" not in edge_data: + continue - sources = set(edge_data["source_id"].split(GRAPH_FIELD_SEP)) - remaining_sources = sources - chunk_ids + edge_tuple = tuple(sorted((src, tgt))) + if ( + edge_tuple in relationships_to_delete + or edge_tuple in relationships_to_rebuild + ): + continue - if not remaining_sources: - relationships_to_delete.add(edge_tuple) - elif remaining_sources != sources: - relationships_to_rebuild[edge_tuple] = remaining_sources + existing_sources: list[str] = [] + if self.relation_chunks: + storage_key = make_relation_chunk_key(src, tgt) + stored_chunks = await 
self.relation_chunks.get_by_id( + storage_key + ) + if stored_chunks and isinstance(stored_chunks, dict): + existing_sources = [ + chunk_id + for chunk_id in stored_chunks.get("chunk_ids", []) + if chunk_id + ] + + if not existing_sources: + existing_sources = [ + chunk_id + for chunk_id in edge_data["source_id"].split( + GRAPH_FIELD_SEP + ) + if chunk_id + ] + + if not existing_sources: + continue + + remaining_sources = subtract_source_ids(existing_sources, chunk_ids) + + if not remaining_sources: + relationships_to_delete.add(edge_tuple) + relation_chunk_updates[edge_tuple] = [] + elif remaining_sources != existing_sources: + relationships_to_rebuild[edge_tuple] = remaining_sources + relation_chunk_updates[edge_tuple] = remaining_sources + else: + logger.info(f"Untouch relation: {edge_tuple}") async with pipeline_status_lock: log_message = ( @@ -2820,6 +3067,45 @@ class LightRAG: pipeline_status["latest_message"] = log_message pipeline_status["history_messages"].append(log_message) + current_time = int(time.time()) + + if entity_chunk_updates and self.entity_chunks: + entity_upsert_payload = {} + entity_delete_ids: set[str] = set() + for entity_name, remaining in entity_chunk_updates.items(): + if not remaining: + entity_delete_ids.add(entity_name) + else: + entity_upsert_payload[entity_name] = { + "chunk_ids": remaining, + "count": len(remaining), + "updated_at": current_time, + } + + if entity_delete_ids: + await self.entity_chunks.delete(list(entity_delete_ids)) + if entity_upsert_payload: + await self.entity_chunks.upsert(entity_upsert_payload) + + if relation_chunk_updates and self.relation_chunks: + relation_upsert_payload = {} + relation_delete_ids: set[str] = set() + for edge_tuple, remaining in relation_chunk_updates.items(): + storage_key = make_relation_chunk_key(*edge_tuple) + if not remaining: + relation_delete_ids.add(storage_key) + else: + relation_upsert_payload[storage_key] = { + "chunk_ids": remaining, + "count": len(remaining), + "updated_at": current_time, + } + + if relation_delete_ids: + await self.relation_chunks.delete(list(relation_delete_ids)) + if relation_upsert_payload: + await self.relation_chunks.upsert(relation_upsert_payload) + except Exception as e: logger.error(f"Failed to process graph analysis results: {e}") raise Exception(f"Failed to process graph dependencies: {e}") from e @@ -2914,6 +3200,8 @@ class LightRAG: global_config=asdict(self), pipeline_status=pipeline_status, pipeline_status_lock=pipeline_status_lock, + entity_chunks_storage=self.entity_chunks, + relation_chunks_storage=self.relation_chunks, ) except Exception as e: diff --git a/lightrag/namespace.py b/lightrag/namespace.py index 2acfe9a4..eccd168d 100644 --- a/lightrag/namespace.py +++ b/lightrag/namespace.py @@ -10,6 +10,8 @@ class NameSpace: KV_STORE_LLM_RESPONSE_CACHE = "llm_response_cache" KV_STORE_FULL_ENTITIES = "full_entities" KV_STORE_FULL_RELATIONS = "full_relations" + KV_STORE_ENTITY_CHUNKS = "entity_chunks" + KV_STORE_RELATION_CHUNKS = "relation_chunks" VECTOR_STORE_ENTITIES = "entities" VECTOR_STORE_RELATIONSHIPS = "relationships" diff --git a/lightrag/operate.py b/lightrag/operate.py index c27f417d..2f7f6340 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -7,7 +7,7 @@ import json_repair from typing import Any, AsyncIterator, overload, Literal from collections import Counter, defaultdict -from .utils import ( +from lightrag.utils import ( logger, compute_mdhash_id, Tokenizer, @@ -27,14 +27,16 @@ from .utils import ( pick_by_vector_similarity, 
process_chunks_unified, build_file_path, - truncate_entity_source_id, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, convert_to_user_format, generate_reference_list_from_chunks, + apply_source_ids_limit, + merge_source_ids, + make_relation_chunk_key, ) -from .base import ( +from lightrag.base import ( BaseGraphStorage, BaseKVStorage, BaseVectorStorage, @@ -43,8 +45,8 @@ from .base import ( QueryResult, QueryContextResult, ) -from .prompt import PROMPTS -from .constants import ( +from lightrag.prompt import PROMPTS +from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_ENTITY_TOKENS, DEFAULT_MAX_RELATION_TOKENS, @@ -53,8 +55,9 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, + SOURCE_IDS_LIMIT_METHOD_KEEP, ) -from .kg.shared_storage import get_storage_keyed_lock +from lightrag.kg.shared_storage import get_storage_keyed_lock import time from dotenv import load_dotenv @@ -474,8 +477,8 @@ async def _handle_single_relationship_extraction( async def _rebuild_knowledge_from_chunks( - entities_to_rebuild: dict[str, set[str]], - relationships_to_rebuild: dict[tuple[str, str], set[str]], + entities_to_rebuild: dict[str, list[str]], + relationships_to_rebuild: dict[tuple[str, str], list[str]], knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, relationships_vdb: BaseVectorStorage, @@ -484,6 +487,8 @@ async def _rebuild_knowledge_from_chunks( global_config: dict[str, str], pipeline_status: dict | None = None, pipeline_status_lock=None, + entity_chunks_storage: BaseKVStorage | None = None, + relation_chunks_storage: BaseKVStorage | None = None, ) -> None: """Rebuild entity and relationship descriptions from cached extraction results with parallel processing @@ -492,8 +497,8 @@ async def _rebuild_knowledge_from_chunks( controlled by llm_model_max_async and using get_storage_keyed_lock for data consistency. 
Args: - entities_to_rebuild: Dict mapping entity_name -> set of remaining chunk_ids - relationships_to_rebuild: Dict mapping (src, tgt) -> set of remaining chunk_ids + entities_to_rebuild: Dict mapping entity_name -> list of remaining chunk_ids + relationships_to_rebuild: Dict mapping (src, tgt) -> list of remaining chunk_ids knowledge_graph_inst: Knowledge graph storage entities_vdb: Entity vector database relationships_vdb: Relationship vector database @@ -502,6 +507,8 @@ async def _rebuild_knowledge_from_chunks( global_config: Global configuration containing llm_model_max_async pipeline_status: Pipeline status dictionary pipeline_status_lock: Lock for pipeline status + entity_chunks_storage: KV storage maintaining full chunk IDs per entity + relation_chunks_storage: KV storage maintaining full chunk IDs per relation """ if not entities_to_rebuild and not relationships_to_rebuild: return @@ -641,10 +648,11 @@ async def _rebuild_knowledge_from_chunks( chunk_entities=chunk_entities, llm_response_cache=llm_response_cache, global_config=global_config, + entity_chunks_storage=entity_chunks_storage, ) rebuilt_entities_count += 1 status_message = ( - f"Rebuilt `{entity_name}` from {len(chunk_ids)} chunks" + f"Rebuild `{entity_name}` from {len(chunk_ids)} chunks" ) logger.info(status_message) if pipeline_status is not None and pipeline_status_lock is not None: @@ -682,16 +690,11 @@ async def _rebuild_knowledge_from_chunks( chunk_relationships=chunk_relationships, llm_response_cache=llm_response_cache, global_config=global_config, + relation_chunks_storage=relation_chunks_storage, + pipeline_status=pipeline_status, + pipeline_status_lock=pipeline_status_lock, ) rebuilt_relationships_count += 1 - status_message = ( - f"Rebuilt `{src} - {tgt}` from {len(chunk_ids)} chunks" - ) - logger.info(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) except Exception as e: failed_relationships_count += 1 status_message = f"Failed to rebuild `{src} - {tgt}`: {e}" @@ -1002,10 +1005,13 @@ async def _rebuild_single_entity( knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, entity_name: str, - chunk_ids: set[str], + chunk_ids: list[str], chunk_entities: dict, llm_response_cache: BaseKVStorage, global_config: dict[str, str], + entity_chunks_storage: BaseKVStorage | None = None, + pipeline_status: dict | None = None, + pipeline_status_lock=None, ) -> None: """Rebuild a single entity from cached extraction results""" @@ -1016,7 +1022,11 @@ async def _rebuild_single_entity( # Helper function to update entity in both graph and vector storage async def _update_entity_storage( - final_description: str, entity_type: str, file_paths: set[str] + final_description: str, + entity_type: str, + file_paths: set[str], + source_chunk_ids: list[str], + truncation_info: str = "", ): try: # Update entity in graph storage (critical path) @@ -1024,10 +1034,12 @@ async def _rebuild_single_entity( **current_entity, "description": final_description, "entity_type": entity_type, - "source_id": GRAPH_FIELD_SEP.join(chunk_ids), + "source_id": GRAPH_FIELD_SEP.join(source_chunk_ids), "file_path": GRAPH_FIELD_SEP.join(file_paths) if file_paths else current_entity.get("file_path", "unknown_source"), + "created_at": int(time.time()), + "truncate": truncation_info, } await knowledge_graph_inst.upsert_node(entity_name, updated_entity_data) @@ 
-1060,9 +1072,33 @@ async def _rebuild_single_entity( logger.error(error_msg) raise # Re-raise exception - # Collect all entity data from relevant chunks + # normalized_chunk_ids = merge_source_ids([], chunk_ids) + normalized_chunk_ids = chunk_ids + + if entity_chunks_storage is not None and normalized_chunk_ids: + await entity_chunks_storage.upsert( + { + entity_name: { + "chunk_ids": normalized_chunk_ids, + "count": len(normalized_chunk_ids), + } + } + ) + + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + + limited_chunk_ids = apply_source_ids_limit( + normalized_chunk_ids, + global_config["max_source_ids_per_entity"], + limit_method, + identifier=f"`{entity_name}`", + ) + + # Collect all entity data from relevant (limited) chunks all_entity_data = [] - for chunk_id in chunk_ids: + for chunk_id in limited_chunk_ids: if chunk_id in chunk_entities and entity_name in chunk_entities[chunk_id]: all_entity_data.extend(chunk_entities[chunk_id][entity_name]) @@ -1109,7 +1145,12 @@ async def _rebuild_single_entity( final_description = current_entity.get("description", "") entity_type = current_entity.get("entity_type", "UNKNOWN") - await _update_entity_storage(final_description, entity_type, file_paths) + await _update_entity_storage( + final_description, + entity_type, + file_paths, + limited_chunk_ids, + ) return # Process cached entity data @@ -1149,7 +1190,31 @@ async def _rebuild_single_entity( else: final_description = current_entity.get("description", "") - await _update_entity_storage(final_description, entity_type, file_paths) + if len(limited_chunk_ids) < len(normalized_chunk_ids): + truncation_info = ( + f"{limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" + ) + else: + truncation_info = "" + + await _update_entity_storage( + final_description, + entity_type, + file_paths, + limited_chunk_ids, + truncation_info, + ) + + # Log rebuild completion with truncation info + status_message = f"Rebuild `{entity_name}` from {len(chunk_ids)} chunks" + if truncation_info: + status_message += f" ({truncation_info})" + logger.info(status_message) + # Update pipeline status + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) async def _rebuild_single_relationship( @@ -1157,10 +1222,13 @@ async def _rebuild_single_relationship( relationships_vdb: BaseVectorStorage, src: str, tgt: str, - chunk_ids: set[str], + chunk_ids: list[str], chunk_relationships: dict, llm_response_cache: BaseKVStorage, global_config: dict[str, str], + relation_chunks_storage: BaseKVStorage | None = None, + pipeline_status: dict | None = None, + pipeline_status_lock=None, ) -> None: """Rebuild a single relationship from cached extraction results @@ -1173,9 +1241,33 @@ async def _rebuild_single_relationship( if not current_relationship: return + # normalized_chunk_ids = merge_source_ids([], chunk_ids) + normalized_chunk_ids = chunk_ids + + if relation_chunks_storage is not None and normalized_chunk_ids: + storage_key = make_relation_chunk_key(src, tgt) + await relation_chunks_storage.upsert( + { + storage_key: { + "chunk_ids": normalized_chunk_ids, + "count": len(normalized_chunk_ids), + } + } + ) + + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + limited_chunk_ids = apply_source_ids_limit( + normalized_chunk_ids, + 
global_config["max_source_ids_per_relation"], + limit_method, + identifier=f"`{src}`~`{tgt}`", + ) + # Collect all relationship data from relevant chunks all_relationship_data = [] - for chunk_id in chunk_ids: + for chunk_id in limited_chunk_ids: if chunk_id in chunk_relationships: # Check both (src, tgt) and (tgt, src) since relationships can be bidirectional for edge_key in [(src, tgt), (tgt, src)]: @@ -1230,6 +1322,13 @@ async def _rebuild_single_relationship( # fallback to keep current(unchanged) final_description = current_relationship.get("description", "") + if len(limited_chunk_ids) < len(normalized_chunk_ids): + truncation_info = ( + f"{limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" + ) + else: + truncation_info = "" + # Update relationship in graph storage updated_relationship_data = { **current_relationship, @@ -1238,10 +1337,11 @@ async def _rebuild_single_relationship( else current_relationship.get("description", ""), "keywords": combined_keywords, "weight": weight, - "source_id": GRAPH_FIELD_SEP.join(chunk_ids), + "source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids), "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp]) if file_paths else current_relationship.get("file_path", "unknown_source"), + "truncate": truncation_info, } await knowledge_graph_inst.upsert_edge(src, tgt, updated_relationship_data) @@ -1287,6 +1387,25 @@ async def _rebuild_single_relationship( logger.error(error_msg) raise # Re-raise exception + # Log rebuild completion with truncation info + status_message = f"Rebuild `{src} - {tgt}` from {len(chunk_ids)} chunks" + if truncation_info: + status_message += f" ({truncation_info})" + # Add truncation info from apply_source_ids_limit if truncation occurred + if len(limited_chunk_ids) < len(normalized_chunk_ids): + truncation_info = ( + f" ({limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)})" + ) + status_message += truncation_info + + logger.info(status_message) + + # Update pipeline status + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + async def _merge_nodes_then_upsert( entity_name: str, @@ -1296,6 +1415,7 @@ async def _merge_nodes_then_upsert( pipeline_status: dict = None, pipeline_status_lock=None, llm_response_cache: BaseKVStorage | None = None, + entity_chunks_storage: BaseKVStorage | None = None, ): """Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.""" already_entity_types = [] @@ -1318,10 +1438,78 @@ async def _merge_nodes_then_upsert( reverse=True, )[0][0] # Get the entity type with the highest count + original_nodes_count = len(nodes_data) + + new_source_ids = [dp["source_id"] for dp in nodes_data if dp.get("source_id")] + + existing_full_source_ids = [] + if entity_chunks_storage is not None: + stored_chunks = await entity_chunks_storage.get_by_id(entity_name) + if stored_chunks and isinstance(stored_chunks, dict): + existing_full_source_ids = [ + chunk_id for chunk_id in stored_chunks.get("chunk_ids", []) if chunk_id + ] + + if not existing_full_source_ids: + existing_full_source_ids = [ + chunk_id for chunk_id in already_source_ids if chunk_id + ] + + full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) + + if entity_chunks_storage is not None and full_source_ids: + await entity_chunks_storage.upsert( + { + entity_name: { + "chunk_ids": full_source_ids, 
+ "count": len(full_source_ids), + } + } + ) + + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + source_ids = apply_source_ids_limit( + full_source_ids, + global_config["max_source_ids_per_entity"], + limit_method, + identifier=f"`{entity_name}`", + ) + + # Only apply filtering in IGNORE_NEW mode + if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: + allowed_source_ids = set(source_ids) + filtered_nodes = [] + for dp in nodes_data: + source_id = dp.get("source_id") + # Skip descriptions sourced from chunks dropped by the IGNORE_NEW cap + if ( + source_id + and source_id not in allowed_source_ids + and source_id not in existing_full_source_ids + ): + continue + filtered_nodes.append(dp) + nodes_data = filtered_nodes + else: + # In FIFO mode, keep all node descriptions - truncation happens at source_ids level only + nodes_data = list(nodes_data) + + max_source_limit = global_config["max_source_ids_per_entity"] + skip_summary_due_to_limit = ( + limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP + and len(existing_full_source_ids) >= max_source_limit + and not nodes_data + and already_description + ) + # Deduplicate by description, keeping first occurrence unique_nodes = {} for dp in nodes_data: - desc = dp["description"] + desc = dp.get("description") + if not desc: + continue if desc not in unique_nodes: unique_nodes[desc] = dp @@ -1332,17 +1520,31 @@ async def _merge_nodes_then_upsert( ) sorted_descriptions = [dp["description"] for dp in sorted_nodes] + truncation_info = "" + dd_message = "" + # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions + deduplicated_num = original_nodes_count - len(sorted_descriptions) + if deduplicated_num > 0: + dd_message = f"dd:{deduplicated_num}" num_fragment = len(description_list) already_fragment = len(already_description) - deduplicated_num = already_fragment + len(nodes_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"(dd:{deduplicated_num})" - else: - dd_message = "" - if num_fragment > 0: + if skip_summary_due_to_limit: + description = ( + already_node.get("description", "(no description)") + if already_node + else "(no description)" + ) + llm_was_used = False + status_message = f"Skip merge for `{entity_name}`: IGNORE_NEW limit reached" + logger.debug(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( "Entity", @@ -1355,9 +1557,16 @@ async def _merge_nodes_then_upsert( # Log based on actual LLM usage if llm_was_used: - status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" else: - status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" + + # Add truncation info from apply_source_ids_limit if truncation occurred + if len(source_ids) < len(full_source_ids): + truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + + if dd_message or truncation_info: + status_message += 
f"({','.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1372,9 +1581,6 @@ async def _merge_nodes_then_upsert( logger.error(f"Entity {entity_name} has no description") description = "(no description)" - merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids) - - source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config) source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, nodes_data, entity_name) @@ -1386,6 +1592,7 @@ async def _merge_nodes_then_upsert( source_id=source_id, file_path=file_path, created_at=int(time.time()), + truncate=truncation_info, ) await knowledge_graph_inst.upsert_node( entity_name, @@ -1405,6 +1612,7 @@ async def _merge_edges_then_upsert( pipeline_status_lock=None, llm_response_cache: BaseKVStorage | None = None, added_entities: list = None, # New parameter to track entities added during edge processing + relation_chunks_storage: BaseKVStorage | None = None, ): if src_id == tgt_id: return None @@ -1448,16 +1656,84 @@ async def _merge_edges_then_upsert( ) ) + original_edges_count = len(edges_data) + + new_source_ids = [dp["source_id"] for dp in edges_data if dp.get("source_id")] + + storage_key = make_relation_chunk_key(src_id, tgt_id) + existing_full_source_ids = [] + if relation_chunks_storage is not None: + stored_chunks = await relation_chunks_storage.get_by_id(storage_key) + if stored_chunks and isinstance(stored_chunks, dict): + existing_full_source_ids = [ + chunk_id for chunk_id in stored_chunks.get("chunk_ids", []) if chunk_id + ] + + if not existing_full_source_ids: + existing_full_source_ids = [ + chunk_id for chunk_id in already_source_ids if chunk_id + ] + + full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) + + if relation_chunks_storage is not None and full_source_ids: + await relation_chunks_storage.upsert( + { + storage_key: { + "chunk_ids": full_source_ids, + "count": len(full_source_ids), + } + } + ) + + source_ids = apply_source_ids_limit( + full_source_ids, + global_config["max_source_ids_per_relation"], + global_config.get("source_ids_limit_method"), + identifier=f"`{src_id}`~`{tgt_id}`", + ) + limit_method = ( + global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP + ) + + # Only apply filtering in IGNORE_NEW mode + if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: + allowed_source_ids = set(source_ids) + filtered_edges = [] + for dp in edges_data: + source_id = dp.get("source_id") + # Skip relationship fragments sourced from chunks dropped by the IGNORE_NEW cap + if ( + source_id + and source_id not in allowed_source_ids + and source_id not in existing_full_source_ids + ): + continue + filtered_edges.append(dp) + edges_data = filtered_edges + else: + # In FIFO mode, keep all edge descriptions - truncation happens at source_ids level only + edges_data = list(edges_data) + + max_source_limit = global_config["max_source_ids_per_relation"] + skip_summary_due_to_limit = ( + limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP + and len(existing_full_source_ids) >= max_source_limit + and not edges_data + and already_description + ) + # Process edges_data with None checks weight = sum([dp["weight"] for dp in edges_data] + already_weights) # Deduplicate by description, keeping first occurrence unique_edges = {} for dp in edges_data: - if dp.get("description"): - desc = dp["description"] - if desc not in unique_edges: - unique_edges[desc] = dp + 
description_value = dp.get("description") + if not description_value: + continue + if description_value not in unique_edges: + unique_edges[description_value] = dp # Sort description by timestamp, then by description length (largest to smallest) when timestamps are the same sorted_edges = sorted( @@ -1466,17 +1742,34 @@ async def _merge_edges_then_upsert( ) sorted_descriptions = [dp["description"] for dp in sorted_edges] + truncation_info = "" + dd_message = "" + # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions + deduplicated_num = original_edges_count - len(sorted_descriptions) + if deduplicated_num > 0: + dd_message = f"dd:{deduplicated_num}" num_fragment = len(description_list) already_fragment = len(already_description) - deduplicated_num = already_fragment + len(edges_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"(dd:{deduplicated_num})" - else: - dd_message = "" - if num_fragment > 0: + + if skip_summary_due_to_limit: + description = ( + already_edge.get("description", "(no description)") + if already_edge + else "(no description)" + ) + llm_was_used = False + status_message = ( + f"Skip merge for `{src_id}`~`{tgt_id}`: IGNORE_NEW limit reached" + ) + logger.debug(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( "Relation", @@ -1489,9 +1782,16 @@ async def _merge_edges_then_upsert( # Log based on actual LLM usage if llm_was_used: - status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" else: - status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}{dd_message}" + status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" + + # Add truncation info from apply_source_ids_limit if truncation occurred + if len(source_ids) < len(full_source_ids): + truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + + if dd_message or truncation_info: + status_message += f"({','.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1521,12 +1821,7 @@ async def _merge_edges_then_upsert( # Join all unique keywords with commas keywords = ",".join(sorted(all_keywords)) - source_id = GRAPH_FIELD_SEP.join( - set( - [dp["source_id"] for dp in edges_data if dp.get("source_id")] - + already_source_ids - ) - ) + source_id = GRAPH_FIELD_SEP.join(source_ids) file_path = build_file_path(already_file_paths, edges_data, f"{src_id}-{tgt_id}") for need_insert_id in [src_id, tgt_id]: @@ -1538,6 +1833,7 @@ async def _merge_edges_then_upsert( "entity_type": "UNKNOWN", "file_path": file_path, "created_at": int(time.time()), + "truncate": "", } await knowledge_graph_inst.upsert_node(need_insert_id, node_data=node_data) @@ -1563,6 +1859,7 @@ async def _merge_edges_then_upsert( source_id=source_id, file_path=file_path, created_at=int(time.time()), + truncate=truncation_info, ), ) @@ -1574,6 +1871,7 @@ async def _merge_edges_then_upsert( source_id=source_id, file_path=file_path, 
created_at=int(time.time()), + truncate=truncation_info, ) return edge_data @@ -1591,6 +1889,8 @@ async def merge_nodes_and_edges( pipeline_status: dict = None, pipeline_status_lock=None, llm_response_cache: BaseKVStorage | None = None, + entity_chunks_storage: BaseKVStorage | None = None, + relation_chunks_storage: BaseKVStorage | None = None, current_file_number: int = 0, total_files: int = 0, file_path: str = "unknown_source", @@ -1614,6 +1914,8 @@ async def merge_nodes_and_edges( pipeline_status: Pipeline status dictionary pipeline_status_lock: Lock for pipeline status llm_response_cache: LLM response cache + entity_chunks_storage: Storage tracking full chunk lists per entity + relation_chunks_storage: Storage tracking full chunk lists per relation current_file_number: Current file number for logging total_files: Total files for logging file_path: File path for logging @@ -1671,6 +1973,7 @@ async def merge_nodes_and_edges( pipeline_status, pipeline_status_lock, llm_response_cache, + entity_chunks_storage, ) # Vector database operation (equally critical, must succeed) @@ -1689,7 +1992,6 @@ async def merge_nodes_and_edges( } } - logger.debug(f"Inserting {entity_name} in Graph") # Use safe operation wrapper - VDB failure must throw exception await safe_vdb_operation_with_exception( @@ -1804,6 +2106,7 @@ async def merge_nodes_and_edges( pipeline_status_lock, llm_response_cache, added_entities, # Pass list to collect added entities + relation_chunks_storage, ) if edge_data is None: diff --git a/lightrag/utils.py b/lightrag/utils.py index 959607e5..6805227e 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -15,7 +15,17 @@ from dataclasses import dataclass from datetime import datetime from functools import wraps from hashlib import md5 -from typing import Any, Protocol, Callable, TYPE_CHECKING, List, Optional +from typing import ( + Any, + Protocol, + Callable, + TYPE_CHECKING, + List, + Optional, + Iterable, + Sequence, + Collection, +) import numpy as np from dotenv import load_dotenv @@ -26,6 +36,9 @@ from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_MAX_FILE_PATH_LENGTH, + DEFAULT_SOURCE_IDS_LIMIT_METHOD, + VALID_SOURCE_IDS_LIMIT_METHODS, + SOURCE_IDS_LIMIT_METHOD_FIFO, ) # Initialize logger with basic configuration @@ -2464,24 +2477,112 @@ async def process_chunks_unified( return final_chunks -def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set: - """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" - already_len: int = len(chunk_ids) - max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"] +def normalize_source_ids_limit_method(method: str | None) -> str: + """Normalize the source ID limiting strategy and fall back to default when invalid.""" - if already_len <= max_chunk_ids_per_entity: - return chunk_ids + if not method: + return DEFAULT_SOURCE_IDS_LIMIT_METHOD - logger.warning( - f"Source Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, " - f"current size: {already_len}, truncating..." 
- ) + normalized = method.upper() + if normalized not in VALID_SOURCE_IDS_LIMIT_METHODS: + logger.warning( + "Unknown SOURCE_IDS_LIMIT_METHOD '%s', falling back to %s", + method, + DEFAULT_SOURCE_IDS_LIMIT_METHOD, + ) + return DEFAULT_SOURCE_IDS_LIMIT_METHOD - truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ]) - return truncated_chunk_ids + return normalized +def merge_source_ids( + existing_ids: Iterable[str] | None, new_ids: Iterable[str] | None +) -> list[str]: + """Merge two iterables of source IDs while preserving order and removing duplicates.""" + + merged: list[str] = [] + seen: set[str] = set() + + for sequence in (existing_ids, new_ids): + if not sequence: + continue + for source_id in sequence: + if not source_id: + continue + if source_id not in seen: + seen.add(source_id) + merged.append(source_id) + + return merged + + +def apply_source_ids_limit( + source_ids: Sequence[str], + limit: int, + method: str, + *, + identifier: str | None = None, +) -> list[str]: + """Apply a limit strategy to a sequence of source IDs.""" + + if limit <= 0: + return [] + + source_ids_list = list(source_ids) + if len(source_ids_list) <= limit: + return source_ids_list + + normalized_method = normalize_source_ids_limit_method(method) + + if normalized_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + truncated = source_ids_list[-limit:] + else: # IGNORE_NEW + truncated = source_ids_list[:limit] + + if identifier and len(truncated) < len(source_ids_list): + logger.debug( + "Source_id truncated: %s | %s keeping %s of %s entries", + identifier, + normalized_method, + len(truncated), + len(source_ids_list), + ) + + return truncated + + +def subtract_source_ids( + source_ids: Iterable[str], + ids_to_remove: Collection[str], +) -> list[str]: + """Remove a collection of IDs from an ordered iterable while preserving order.""" + + removal_set = set(ids_to_remove) + if not removal_set: + return [source_id for source_id in source_ids if source_id] + + return [ + source_id + for source_id in source_ids + if source_id and source_id not in removal_set + ] + + +def make_relation_chunk_key(src: str, tgt: str) -> str: + """Create a deterministic storage key for relation chunk tracking.""" + + return GRAPH_FIELD_SEP.join(sorted((src, tgt))) + + +def parse_relation_chunk_key(key: str) -> tuple[str, str]: + """Parse a relation chunk storage key back into its entity pair.""" + + parts = key.split(GRAPH_FIELD_SEP) + if len(parts) != 2: + raise ValueError(f"Invalid relation chunk key: {key}") + return parts[0], parts[1] + def build_file_path(already_file_paths, data_list, target): """Build file path string with UTF-8 byte length limit and deduplication From a9fec26798042c44e98f3700d84aa81f4acd90b3 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 20:12:53 +0800 Subject: [PATCH 08/25] Add file path limit configuration for entities and relations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add MAX_FILE_PATHS env variable • Implement file path count limiting • Support KEEP/FIFO strategies • Add truncation placeholder • Remove old build_file_path function --- env.example | 3 + lightrag/constants.py | 28 +++--- lightrag/lightrag.py | 10 ++ lightrag/operate.py | 213 ++++++++++++++++++++++++++++++++++++++---- lightrag/utils.py | 60 ------------ 5 files changed, 224 insertions(+), 90 deletions(-) diff --git a/env.example b/env.example index 6d53c390..3529cf58 100644 --- a/env.example +++ b/env.example @@ -73,11 +73,14 @@ ENABLE_LLM_CACHE=true # 
MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 + ### control the maximum chunk_ids stored in vector and graph db # MAX_SOURCE_IDS_PER_ENTITY=300 # MAX_SOURCE_IDS_PER_RELATION=300 ### control chunk_ids limitation method: KEEP, FIFO (KEPP: Ingore New Chunks, FIFO: New chunks replace old chunks) # SOURCE_IDS_LIMIT_METHOD=KEEP +### Maximum number of file paths stored in entity/relation file_path field +# MAX_FILE_PATHS=30 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/constants.py b/lightrag/constants.py index e374a991..62ca1888 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -14,16 +14,6 @@ DEFAULT_MAX_GRAPH_NODES = 1000 DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 -DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 -DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 -SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" -SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" -DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP -VALID_SOURCE_IDS_LIMIT_METHODS = { - SOURCE_IDS_LIMIT_METHOD_KEEP, - SOURCE_IDS_LIMIT_METHOD_FIFO, -} - # Number of description fragments to trigger LLM summary DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8 # Max description token size to trigger LLM summary @@ -67,8 +57,24 @@ DEFAULT_HISTORY_TURNS = 0 DEFAULT_MIN_RERANK_SCORE = 0.0 DEFAULT_RERANK_BINDING = "null" -# File path configuration for vector and graph database(Should not be changed, used in Milvus Schema) +# Default source ids limit in meta data for entity and relation +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 +DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 +SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" +SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" +DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP +VALID_SOURCE_IDS_LIMIT_METHODS = { + SOURCE_IDS_LIMIT_METHOD_KEEP, + SOURCE_IDS_LIMIT_METHOD_FIFO, +} +# Default file_path limit in meta data for entity and relation +DEFAULT_MAX_FILE_PATHS = 2 + +# Field length of file_path in Milvus Schema for entity and relation (Should not be changed) +# file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata. 
DEFAULT_MAX_FILE_PATH_LENGTH = 32768 +# Placeholder for more file paths in meta data for entity and relation (Should not be changed) +DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated" # Default temperature for LLM DEFAULT_TEMPERATURE = 1.0 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 1f32da50..4380a276 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -47,6 +47,8 @@ from lightrag.constants import ( DEFAULT_LLM_TIMEOUT, DEFAULT_EMBEDDING_TIMEOUT, DEFAULT_SOURCE_IDS_LIMIT_METHOD, + DEFAULT_MAX_FILE_PATHS, + DEFAULT_FILE_PATH_MORE_PLACEHOLDER, ) from lightrag.utils import get_env_value @@ -393,6 +395,14 @@ class LightRAG: ) """Strategy for enforcing source_id limits: IGNORE_NEW or FIFO.""" + max_file_paths: int = field( + default=get_env_value("MAX_FILE_PATHS", DEFAULT_MAX_FILE_PATHS, int) + ) + """Maximum number of file paths to store in entity/relation file_path field.""" + + file_path_more_placeholder: str = field(default=DEFAULT_FILE_PATH_MORE_PLACEHOLDER) + """Placeholder text when file paths exceed max_file_paths limit.""" + addon_params: dict[str, Any] = field( default_factory=lambda: { "language": get_env_value( diff --git a/lightrag/operate.py b/lightrag/operate.py index 2f7f6340..6b409f21 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -26,7 +26,6 @@ from lightrag.utils import ( pick_by_weighted_polling, pick_by_vector_similarity, process_chunks_unified, - build_file_path, safe_vdb_operation_with_exception, create_prefixed_exception, fix_tuple_delimiter_corruption, @@ -56,6 +55,8 @@ from lightrag.constants import ( DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, SOURCE_IDS_LIMIT_METHOD_KEEP, + SOURCE_IDS_LIMIT_METHOD_FIFO, + DEFAULT_FILE_PATH_MORE_PLACEHOLDER, ) from lightrag.kg.shared_storage import get_storage_keyed_lock import time @@ -1156,7 +1157,8 @@ async def _rebuild_single_entity( # Process cached entity data descriptions = [] entity_types = [] - file_paths = set() + file_paths_list = [] + seen_paths = set() for entity_data in all_entity_data: if entity_data.get("description"): @@ -1164,7 +1166,35 @@ async def _rebuild_single_entity( if entity_data.get("entity_type"): entity_types.append(entity_data["entity_type"]) if entity_data.get("file_path"): - file_paths.add(entity_data["file_path"]) + file_path = entity_data["file_path"] + if file_path and file_path not in seen_paths: + file_paths_list.append(file_path) + seen_paths.add(file_path) + + # Apply MAX_FILE_PATHS limit + max_file_paths = global_config.get("max_file_paths") + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + limit_method = global_config.get("source_ids_limit_method") + + original_count = len(file_paths_list) + if original_count > max_file_paths: + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." 
+ ) + logger.info( + f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_paths = set(file_paths_list) # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) @@ -1284,7 +1314,8 @@ async def _rebuild_single_relationship( descriptions = [] keywords = [] weights = [] - file_paths = set() + file_paths_list = [] + seen_paths = set() for rel_data in all_relationship_data: if rel_data.get("description"): @@ -1294,7 +1325,35 @@ async def _rebuild_single_relationship( if rel_data.get("weight"): weights.append(rel_data["weight"]) if rel_data.get("file_path"): - file_paths.add(rel_data["file_path"]) + file_path = rel_data["file_path"] + if file_path and file_path not in seen_paths: + file_paths_list.append(file_path) + seen_paths.add(file_path) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + limit_method = global_config.get("source_ids_limit_method") + + original_count = len(file_paths_list) + if original_count > max_file_paths: + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + ) + logger.info( + f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_paths = set(file_paths_list) # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) @@ -1467,23 +1526,22 @@ async def _merge_nodes_then_upsert( } ) - limit_method = ( - global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP - ) + limit_method = global_config.get("source_ids_limit_method") + max_source_limit = global_config.get("max_source_ids_per_entity") source_ids = apply_source_ids_limit( full_source_ids, - global_config["max_source_ids_per_entity"], + max_source_limit, limit_method, identifier=f"`{entity_name}`", ) - # Only apply filtering in IGNORE_NEW mode + # Only apply filtering in KEEP(ignore new) mode if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_nodes = [] for dp in nodes_data: source_id = dp.get("source_id") - # Skip descriptions sourced from chunks dropped by the IGNORE_NEW cap + # Skip descriptions sourced from chunks dropped by the limitation cap if ( source_id and source_id not in allowed_source_ids @@ -1496,7 +1554,6 @@ async def _merge_nodes_then_upsert( # In FIFO mode, keep all node descriptions - truncation happens at source_ids level only nodes_data = list(nodes_data) - max_source_limit = global_config["max_source_ids_per_entity"] skip_summary_due_to_limit = ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit @@ -1566,7 +1623,7 @@ async def _merge_nodes_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f"({','.join([truncation_info, dd_message])})" + status_message += f" ({', '.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1583,7 +1640,65 @@ async def _merge_nodes_then_upsert( source_id = GRAPH_FIELD_SEP.join(source_ids) - 
file_path = build_file_path(already_file_paths, nodes_data, entity_name) + # Build file_path with count limit + if skip_summary_due_to_limit: + # Skip limit, keep original file_path + file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) + else: + # Collect and apply limit + file_paths_list = [] + seen_paths = set() + + # Get placeholder to filter it out + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + # Skip placeholders (format: "...{placeholder}(showing X of Y)...") + if ( + fp + and not fp.startswith(f"...{file_path_placeholder}") + and fp not in seen_paths + ): + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in nodes_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP + ) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + original_count = len(file_paths_list) + + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + ) + logger.info( + f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_path = GRAPH_FIELD_SEP.join(file_paths_list) node_data = dict( entity_id=entity_name, @@ -1686,10 +1801,12 @@ async def _merge_edges_then_upsert( } ) + limit_method = global_config.get("source_ids_limit_method") + max_source_limit = global_config.get("max_source_ids_per_relation") source_ids = apply_source_ids_limit( full_source_ids, - global_config["max_source_ids_per_relation"], - global_config.get("source_ids_limit_method"), + max_source_limit, + limit_method, identifier=f"`{src_id}`~`{tgt_id}`", ) limit_method = ( @@ -1715,7 +1832,6 @@ async def _merge_edges_then_upsert( # In FIFO mode, keep all edge descriptions - truncation happens at source_ids level only edges_data = list(edges_data) - max_source_limit = global_config["max_source_ids_per_relation"] skip_summary_due_to_limit = ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit @@ -1791,7 +1907,7 @@ async def _merge_edges_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f"({','.join([truncation_info, dd_message])})" + status_message += f" ({', '.join([truncation_info, dd_message])})" if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1822,7 +1938,66 @@ async def _merge_edges_then_upsert( keywords = ",".join(sorted(all_keywords)) source_id = GRAPH_FIELD_SEP.join(source_ids) - file_path = build_file_path(already_file_paths, edges_data, f"{src_id}-{tgt_id}") + + # Build file_path with count limit + if skip_summary_due_to_limit: + # Skip limit, keep original file_path + file_path = GRAPH_FIELD_SEP.join(fp for fp in 
already_file_paths if fp) + else: + # Collect and apply limit + file_paths_list = [] + seen_paths = set() + + # Get placeholder to filter it out + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + # Skip placeholders (format: "...{placeholder}(showing X of Y)...") + if ( + fp + and not fp.startswith(f"...{file_path_placeholder}") + and fp not in seen_paths + ): + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in edges_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP + ) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + original_count = len(file_paths_list) + + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] + + file_paths_list.append( + f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + ) + logger.info( + f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" + ) + + file_path = GRAPH_FIELD_SEP.join(file_paths_list) for need_insert_id in [src_id, tgt_id]: if not (await knowledge_graph_inst.has_node(need_insert_id)): diff --git a/lightrag/utils.py b/lightrag/utils.py index 6805227e..bfa3cac4 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -35,7 +35,6 @@ from lightrag.constants import ( DEFAULT_LOG_FILENAME, GRAPH_FIELD_SEP, DEFAULT_MAX_TOTAL_TOKENS, - DEFAULT_MAX_FILE_PATH_LENGTH, DEFAULT_SOURCE_IDS_LIMIT_METHOD, VALID_SOURCE_IDS_LIMIT_METHODS, SOURCE_IDS_LIMIT_METHOD_FIFO, @@ -2584,65 +2583,6 @@ def parse_relation_chunk_key(key: str) -> tuple[str, str]: return parts[0], parts[1] -def build_file_path(already_file_paths, data_list, target): - """Build file path string with UTF-8 byte length limit and deduplication - - Args: - already_file_paths: List of existing file paths - data_list: List of data items containing file_path - target: Target name for logging warnings - - Returns: - str: Combined file paths separated by GRAPH_FIELD_SEP - """ - # set: deduplication - file_paths_set = {fp for fp in already_file_paths if fp} - - # string: filter empty value and keep file order in already_file_paths - file_paths = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) - - # Check if initial file_paths already exceeds byte length limit - if len(file_paths.encode("utf-8")) >= DEFAULT_MAX_FILE_PATH_LENGTH: - logger.warning( - f"Initial file_paths already exceeds {DEFAULT_MAX_FILE_PATH_LENGTH} bytes for {target}, " - f"current size: {len(file_paths.encode('utf-8'))} bytes" - ) - - # ignored file_paths - file_paths_ignore = "" - # add file_paths - for dp in data_list: - cur_file_path = dp.get("file_path") - # empty - if not cur_file_path: - continue - - # skip duplicate item - if cur_file_path in file_paths_set: - continue - # add - file_paths_set.add(cur_file_path) - - # check the UTF-8 byte length - new_addition 
= GRAPH_FIELD_SEP + cur_file_path if file_paths else cur_file_path - if ( - len(file_paths.encode("utf-8")) + len(new_addition.encode("utf-8")) - < DEFAULT_MAX_FILE_PATH_LENGTH - 5 - ): - # append - file_paths += new_addition - else: - # ignore - file_paths_ignore += GRAPH_FIELD_SEP + cur_file_path - - if file_paths_ignore: - logger.warning( - f"File paths exceed {DEFAULT_MAX_FILE_PATH_LENGTH} bytes for {target}, " - f"ignoring file path: {file_paths_ignore}" - ) - return file_paths - - def generate_track_id(prefix: str = "upload") -> str: """Generate a unique tracking ID with timestamp and UUID From e0fd31a60d5e346b4cd9566d114789fab915fcbd Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 22:09:09 +0800 Subject: [PATCH 09/25] Fix logging message formatting --- env.example | 26 +++++++++++++------------- lightrag/constants.py | 6 +++--- lightrag/operate.py | 16 ++++++++++------ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/env.example b/env.example index 3529cf58..73f2d7b7 100644 --- a/env.example +++ b/env.example @@ -74,19 +74,6 @@ ENABLE_LLM_CACHE=true ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector and graph db -# MAX_SOURCE_IDS_PER_ENTITY=300 -# MAX_SOURCE_IDS_PER_RELATION=300 -### control chunk_ids limitation method: KEEP, FIFO (KEPP: Ingore New Chunks, FIFO: New chunks replace old chunks) -# SOURCE_IDS_LIMIT_METHOD=KEEP -### Maximum number of file paths stored in entity/relation file_path field -# MAX_FILE_PATHS=30 - -### maximum number of related chunks per source entity or relation -### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) -### Higher values increase re-ranking time -# RELATED_CHUNK_NUMBER=5 - ### chunk selection strategies ### VECTOR: Pick KG chunks by vector similarity, delivered chunks to the LLM aligning more closely with naive retrieval ### WEIGHT: Pick KG chunks by entity and chunk weight, delivered more solely KG related chunks to the LLM @@ -148,6 +135,19 @@ SUMMARY_LANGUAGE=English ### Maximum context size sent to LLM for description summary # SUMMARY_CONTEXT_SIZE=12000 +### control the maximum chunk_ids stored in vector and graph db +# MAX_SOURCE_IDS_PER_ENTITY=300 +# MAX_SOURCE_IDS_PER_RELATION=300 +### control chunk_ids limitation method: KEEP, FIFO (KEEP: Keep oldest, FIFO: First in first out) +# SOURCE_IDS_LIMIT_METHOD=KEEP +### Maximum number of file paths stored in entity/relation file_path field +# MAX_FILE_PATHS=30 + +### maximum number of related chunks per source entity or relation +### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) +### Higher values increase re-ranking time +# RELATED_CHUNK_NUMBER=5 + ############################### ### Concurrency Configuration ############################### diff --git a/lightrag/constants.py b/lightrag/constants.py index 62ca1888..ad12cccf 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -60,14 +60,14 @@ DEFAULT_RERANK_BINDING = "null" # Default source ids limit in meta data for entity and relation DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 -SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" -SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" +SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" # Keep oldest +SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" # First In First Out (Keep newest) DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP 
VALID_SOURCE_IDS_LIMIT_METHODS = { SOURCE_IDS_LIMIT_METHOD_KEEP, SOURCE_IDS_LIMIT_METHOD_FIFO, } -# Default file_path limit in meta data for entity and relation +# Default file_path limit in meta data for entity and relation (Use same limit method as source_ids) DEFAULT_MAX_FILE_PATHS = 2 # Field length of file_path in Milvus Schema for entity and relation (Should not be changed) diff --git a/lightrag/operate.py b/lightrag/operate.py index 6b409f21..3e889eb7 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1188,7 +1188,7 @@ async def _rebuild_single_entity( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1347,7 +1347,7 @@ async def _rebuild_single_relationship( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." ) logger.info( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1623,7 +1623,9 @@ async def _merge_nodes_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f" ({', '.join([truncation_info, dd_message])})" + status_message += ( + f" ({', '.join(filter(None, [truncation_info, dd_message]))})" + ) if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1692,7 +1694,7 @@ async def _merge_nodes_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1907,7 +1909,9 @@ async def _merge_edges_then_upsert( truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" if dd_message or truncation_info: - status_message += f" ({', '.join([truncation_info, dd_message])})" + status_message += ( + f" ({', '.join(filter(None, [truncation_info, dd_message]))})" + ) if already_fragment > 0 or llm_was_used: logger.info(status_message) @@ -1991,7 +1995,7 @@ async def _merge_edges_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}(showing {max_file_paths} of {original_count})..." + f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." 
) logger.info( f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" From 637b850ec55593b67566f3514f4d37edc84979d1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 23:03:01 +0800 Subject: [PATCH 10/25] Add truncation indicator and update property labels in graph view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add truncate tooltip to source_id field • Add visual truncation indicator (†) • Bump API version to 0242 --- lightrag/api/__init__.py | 2 +- .../src/components/graph/PropertiesView.tsx | 22 ++++++++++++++----- lightrag_webui/src/locales/ar.json | 6 ++--- lightrag_webui/src/locales/en.json | 4 ++-- lightrag_webui/src/locales/fr.json | 4 ++-- lightrag_webui/src/locales/zh.json | 4 ++-- lightrag_webui/src/locales/zh_TW.json | 4 ++-- 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/lightrag/api/__init__.py b/lightrag/api/__init__.py index 822818a6..e1baefb9 100644 --- a/lightrag/api/__init__.py +++ b/lightrag/api/__init__.py @@ -1 +1 @@ -__api_version__ = "0241" +__api_version__ = "0242" diff --git a/lightrag_webui/src/components/graph/PropertiesView.tsx b/lightrag_webui/src/components/graph/PropertiesView.tsx index 3ebdfd29..b46eb5b4 100644 --- a/lightrag_webui/src/components/graph/PropertiesView.tsx +++ b/lightrag_webui/src/components/graph/PropertiesView.tsx @@ -183,7 +183,8 @@ const PropertyRow = ({ entityType, sourceId, targetId, - isEditable = false + isEditable = false, + truncate }: { name: string value: any @@ -197,6 +198,7 @@ const PropertyRow = ({ sourceId?: string targetId?: string isEditable?: boolean + truncate?: string }) => { const { t } = useTranslation() @@ -216,7 +218,12 @@ const PropertyRow = ({ // Format the value to convert to newlines const formattedValue = formatValueWithSeparators(value) - const formattedTooltip = tooltip || formatValueWithSeparators(value) + let formattedTooltip = tooltip || formatValueWithSeparators(value) + + // If this is source_id field and truncate info exists, append it to the tooltip + if (name === 'source_id' && truncate) { + formattedTooltip += `\n(${truncate} truncated)` + } // Use EditablePropertyRow for editable fields (description, entity_id and keywords) if (isEditable && (name === 'description' || name === 'entity_id' || name === 'keywords')) { @@ -241,7 +248,10 @@ const PropertyRow = ({ // For non-editable fields, use the regular Text component return (
- {getPropertyNameTranslation(name)}: + + {getPropertyNameTranslation(name)} + {name === 'source_id' && truncate && } + : { {Object.keys(node.properties) .sort() .map((name) => { - if (name === 'created_at') return null; // Hide created_at property + if (name === 'created_at' || name === 'truncate') return null; // Hide created_at and truncate properties return ( { entityId={node.properties['entity_id']} entityType="node" isEditable={name === 'description' || name === 'entity_id'} + truncate={node.properties['truncate']} /> ) })} @@ -373,7 +384,7 @@ const EdgePropertiesView = ({ edge }: { edge: EdgeType }) => { {Object.keys(edge.properties) .sort() .map((name) => { - if (name === 'created_at') return null; // Hide created_at property + if (name === 'created_at' || name === 'truncate') return null; // Hide created_at and truncate properties return ( { sourceId={edge.sourceNode?.properties['entity_id'] || edge.source} targetId={edge.targetNode?.properties['entity_id'] || edge.target} isEditable={name === 'description' || name === 'keywords'} + truncate={edge.properties['truncate']} /> ) })} diff --git a/lightrag_webui/src/locales/ar.json b/lightrag_webui/src/locales/ar.json index fb5e84bb..6f2703ca 100644 --- a/lightrag_webui/src/locales/ar.json +++ b/lightrag_webui/src/locales/ar.json @@ -318,10 +318,10 @@ "description": "الوصف", "entity_id": "الاسم", "entity_type": "النوع", - "source_id": "معرف المصدر", + "source_id": "C-ID", "Neighbour": "الجار", - "file_path": "المصدر", - "keywords": "الكلمات الرئيسية", + "file_path": "File", + "keywords": "Keyword", "weight": "الوزن" } }, diff --git a/lightrag_webui/src/locales/en.json b/lightrag_webui/src/locales/en.json index 3f4d04a9..418ac296 100644 --- a/lightrag_webui/src/locales/en.json +++ b/lightrag_webui/src/locales/en.json @@ -318,9 +318,9 @@ "description": "Description", "entity_id": "Name", "entity_type": "Type", - "source_id": "SrcID", + "source_id": "C-ID", "Neighbour": "Neigh", - "file_path": "Source", + "file_path": "File", "keywords": "Keys", "weight": "Weight" } diff --git a/lightrag_webui/src/locales/fr.json b/lightrag_webui/src/locales/fr.json index 9104d34c..463f05eb 100644 --- a/lightrag_webui/src/locales/fr.json +++ b/lightrag_webui/src/locales/fr.json @@ -318,9 +318,9 @@ "description": "Description", "entity_id": "Nom", "entity_type": "Type", - "source_id": "ID source", + "source_id": "C-ID", "Neighbour": "Voisin", - "file_path": "Source", + "file_path": "File", "keywords": "Keys", "weight": "Poids" } diff --git a/lightrag_webui/src/locales/zh.json b/lightrag_webui/src/locales/zh.json index e6f87e18..40d8cdb0 100644 --- a/lightrag_webui/src/locales/zh.json +++ b/lightrag_webui/src/locales/zh.json @@ -318,9 +318,9 @@ "description": "描述", "entity_id": "名称", "entity_type": "类型", - "source_id": "信源ID", + "source_id": "C-ID", "Neighbour": "邻接", - "file_path": "信源", + "file_path": "文件", "keywords": "Keys", "weight": "权重" } diff --git a/lightrag_webui/src/locales/zh_TW.json b/lightrag_webui/src/locales/zh_TW.json index 003ce313..5ea179c2 100644 --- a/lightrag_webui/src/locales/zh_TW.json +++ b/lightrag_webui/src/locales/zh_TW.json @@ -318,9 +318,9 @@ "description": "描述", "entity_id": "名稱", "entity_type": "類型", - "source_id": "來源ID", + "source_id": "C-ID", "Neighbour": "鄰接", - "file_path": "來源", + "file_path": "檔案", "keywords": "Keys", "weight": "權重" } From e01c998ee92d930a689fa8655227d4a632d2615b Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 20 Oct 2025 23:48:04 +0800 Subject: [PATCH 11/25] Track placeholders in file paths for 
accurate source count display MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add has_placeholder tracking variable • Detect placeholder patterns in paths • Show + sign for truncated counts --- lightrag/operate.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 3e889eb7..a5e168be 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1579,6 +1579,7 @@ async def _merge_nodes_then_upsert( truncation_info = "" dd_message = "" + has_placeholder = False # Initialize to track placeholder in file paths # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions @@ -1620,7 +1621,15 @@ async def _merge_nodes_then_upsert( # Add truncation info from apply_source_ids_limit if truncation occurred if len(source_ids) < len(full_source_ids): - truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + # Add + sign if has_placeholder is True, indicating actual file count is higher + full_source_count_str = ( + f"{len(full_source_ids)}+" + if has_placeholder + else str(len(full_source_ids)) + ) + truncation_info = ( + f"{limit_method}:{len(source_ids)}/{full_source_count_str}" + ) if dd_message or truncation_info: status_message += ( @@ -1650,6 +1659,7 @@ async def _merge_nodes_then_upsert( # Collect and apply limit file_paths_list = [] seen_paths = set() + has_placeholder = False # Track if already_file_paths contains placeholder # Get placeholder to filter it out file_path_placeholder = global_config.get( @@ -1658,12 +1668,12 @@ async def _merge_nodes_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: + # Check if this is a placeholder record + if fp and fp.startswith(f"...{file_path_placeholder}"): + has_placeholder = True + continue # Skip placeholders (format: "...{placeholder}(showing X of Y)...") - if ( - fp - and not fp.startswith(f"...{file_path_placeholder}") - and fp not in seen_paths - ): + if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) @@ -1862,6 +1872,7 @@ async def _merge_edges_then_upsert( truncation_info = "" dd_message = "" + has_placeholder = False # Initialize to track placeholder in file paths # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions @@ -1906,7 +1917,15 @@ async def _merge_edges_then_upsert( # Add truncation info from apply_source_ids_limit if truncation occurred if len(source_ids) < len(full_source_ids): - truncation_info = f"{limit_method}:{len(source_ids)}/{len(full_source_ids)}" + # Add + sign if has_placeholder is True, indicating actual file count is higher + full_source_count_str = ( + f"{len(full_source_ids)}+" + if has_placeholder + else str(len(full_source_ids)) + ) + truncation_info = ( + f"{limit_method}:{len(source_ids)}/{full_source_count_str}" + ) if dd_message or truncation_info: status_message += ( @@ -1951,6 +1970,7 @@ async def _merge_edges_then_upsert( # Collect and apply limit file_paths_list = [] seen_paths = set() + has_placeholder = False # Track if already_file_paths contains placeholder # Get placeholder to filter it out file_path_placeholder = global_config.get( @@ -1959,12 +1979,12 @@ async def _merge_edges_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: + # Check if this is a placeholder record + if fp 
and fp.startswith(f"...{file_path_placeholder}"): + has_placeholder = True + continue # Skip placeholders (format: "...{placeholder}(showing X of Y)...") - if ( - fp - and not fp.startswith(f"...{file_path_placeholder}") - and fp not in seen_paths - ): + if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) From 665f60b90f4611c180914ffe296b9cd51b3969ff Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 03:19:34 +0800 Subject: [PATCH 12/25] Refactor entity/relation merge to consolidate VDB operations within functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Move VDB upserts into merge functions • Fix early return data structure issues • Update status messages (IGNORE_NEW → KEEP) • Consolidate error handling paths • Improve relationship content format --- lightrag/operate.py | 214 ++++++++++++++++++++++++-------------------- 1 file changed, 117 insertions(+), 97 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index a5e168be..290f19b6 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1470,6 +1470,7 @@ async def _merge_nodes_then_upsert( entity_name: str, nodes_data: list[dict], knowledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage | None, global_config: dict, pipeline_status: dict = None, pipeline_status_lock=None, @@ -1595,13 +1596,25 @@ async def _merge_nodes_then_upsert( if already_node else "(no description)" ) - llm_was_used = False - status_message = f"Skip merge for `{entity_name}`: IGNORE_NEW limit reached" + status_message = f"Skip merge for `{entity_name}`: KEEP limit reached" logger.debug(status_message) if pipeline_status is not None and pipeline_status_lock is not None: async with pipeline_status_lock: pipeline_status["latest_message"] = status_message pipeline_status["history_messages"].append(status_message) + existing_node_data = dict(already_node or {}) + if not existing_node_data: + existing_node_data = { + "entity_id": entity_name, + "entity_type": entity_type, + "description": description, + "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), + "file_path": GRAPH_FIELD_SEP.join(already_file_paths), + "created_at": int(time.time()), + "truncate": "", + } + existing_node_data["entity_name"] = entity_name + return existing_node_data elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( @@ -1726,6 +1739,25 @@ async def _merge_nodes_then_upsert( node_data=node_data, ) node_data["entity_name"] = entity_name + if entity_vdb is not None: + entity_vdb_id = compute_mdhash_id(str(entity_name), prefix="ent-") + entity_content = f"{entity_name}\n{description}" + data_for_vdb = { + entity_vdb_id: { + "entity_name": entity_name, + "entity_type": entity_type, + "content": entity_content, + "source_id": source_id, + "file_path": file_path, + } + } + await safe_vdb_operation_with_exception( + operation=lambda payload=data_for_vdb: entity_vdb.upsert(payload), + operation_name="entity_upsert", + entity_name=entity_name, + max_retries=3, + retry_delay=0.1, + ) return node_data @@ -1734,6 +1766,8 @@ async def _merge_edges_then_upsert( tgt_id: str, edges_data: list[dict], knowledge_graph_inst: BaseGraphStorage, + relationships_vdb: BaseVectorStorage | None, + entity_vdb: BaseVectorStorage | None, global_config: dict, pipeline_status: dict = None, pipeline_status_lock=None, @@ -1744,6 +1778,7 @@ async def _merge_edges_then_upsert( if src_id == tgt_id: return None + already_edge = None 
already_weights = [] already_source_ids = [] already_description = [] @@ -1825,13 +1860,13 @@ async def _merge_edges_then_upsert( global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP ) - # Only apply filtering in IGNORE_NEW mode + # Only apply filtering in KEEP(ignore new) mode if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_edges = [] for dp in edges_data: source_id = dp.get("source_id") - # Skip relationship fragments sourced from chunks dropped by the IGNORE_NEW cap + # Skip relationship fragments sourced from chunks dropped by keep oldest cap if ( source_id and source_id not in allowed_source_ids @@ -1889,15 +1924,29 @@ async def _merge_edges_then_upsert( if already_edge else "(no description)" ) - llm_was_used = False status_message = ( - f"Skip merge for `{src_id}`~`{tgt_id}`: IGNORE_NEW limit reached" + f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" ) logger.debug(status_message) if pipeline_status is not None and pipeline_status_lock is not None: async with pipeline_status_lock: pipeline_status["latest_message"] = status_message pipeline_status["history_messages"].append(status_message) + existing_edge_data = dict(already_edge or {}) + if not existing_edge_data: + existing_edge_data = { + "description": description, + "keywords": GRAPH_FIELD_SEP.join(already_keywords), + "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), + "file_path": GRAPH_FIELD_SEP.join(already_file_paths), + "weight": sum(already_weights) if already_weights else 0.0, + "truncate": "", + "created_at": int(time.time()), + } + existing_edge_data.setdefault("created_at", int(time.time())) + existing_edge_data["src_id"] = src_id + existing_edge_data["tgt_id"] = tgt_id + return existing_edge_data elif num_fragment > 0: # Get summary and LLM usage status description, llm_was_used = await _handle_entity_relation_summary( @@ -2025,17 +2074,38 @@ async def _merge_edges_then_upsert( for need_insert_id in [src_id, tgt_id]: if not (await knowledge_graph_inst.has_node(need_insert_id)): + node_created_at = int(time.time()) node_data = { "entity_id": need_insert_id, "source_id": source_id, "description": description, "entity_type": "UNKNOWN", "file_path": file_path, - "created_at": int(time.time()), + "created_at": node_created_at, "truncate": "", } await knowledge_graph_inst.upsert_node(need_insert_id, node_data=node_data) + if entity_vdb is not None: + entity_vdb_id = compute_mdhash_id(need_insert_id, prefix="ent-") + entity_content = f"{need_insert_id}\n{description}" + vdb_data = { + entity_vdb_id: { + "content": entity_content, + "entity_name": need_insert_id, + "source_id": source_id, + "entity_type": "UNKNOWN", + "file_path": file_path, + } + } + await safe_vdb_operation_with_exception( + operation=lambda payload=vdb_data: entity_vdb.upsert(payload), + operation_name="added_entity_upsert", + entity_name=need_insert_id, + max_retries=3, + retry_delay=0.1, + ) + # Track entities added during edge processing if added_entities is not None: entity_data = { @@ -2044,10 +2114,11 @@ async def _merge_edges_then_upsert( "description": description, "source_id": source_id, "file_path": file_path, - "created_at": int(time.time()), + "created_at": node_created_at, } added_entities.append(entity_data) + edge_created_at = int(time.time()) await knowledge_graph_inst.upsert_edge( src_id, tgt_id, @@ -2057,7 +2128,7 @@ async def _merge_edges_then_upsert( keywords=keywords, source_id=source_id, file_path=file_path, - created_at=int(time.time()), + 
created_at=edge_created_at, truncate=truncation_info, ), ) @@ -2069,10 +2140,41 @@ async def _merge_edges_then_upsert( keywords=keywords, source_id=source_id, file_path=file_path, - created_at=int(time.time()), + created_at=edge_created_at, truncate=truncation_info, + weight=weight, ) + if relationships_vdb is not None: + rel_vdb_id = compute_mdhash_id(src_id + tgt_id, prefix="rel-") + rel_vdb_id_reverse = compute_mdhash_id(tgt_id + src_id, prefix="rel-") + try: + await relationships_vdb.delete([rel_vdb_id, rel_vdb_id_reverse]) + except Exception as e: + logger.debug( + f"Could not delete old relationship vector records {rel_vdb_id}, {rel_vdb_id_reverse}: {e}" + ) + rel_content = f"{keywords}\t{src_id}\n{tgt_id}\n{description}" + vdb_data = { + rel_vdb_id: { + "src_id": src_id, + "tgt_id": tgt_id, + "source_id": source_id, + "content": rel_content, + "keywords": keywords, + "description": description, + "weight": weight, + "file_path": file_path, + } + } + await safe_vdb_operation_with_exception( + operation=lambda payload=vdb_data: relationships_vdb.upsert(payload), + operation_name="relationship_upsert", + entity_name=f"{src_id}-{tgt_id}", + max_retries=3, + retry_delay=0.2, + ) + return edge_data @@ -2162,12 +2264,12 @@ async def merge_nodes_and_edges( [entity_name], namespace=namespace, enable_logging=False ): try: - logger.debug(f"Inserting {entity_name} in Graph") - # Graph database operation (critical path, must succeed) + logger.debug(f"Processing entity {entity_name}") entity_data = await _merge_nodes_then_upsert( entity_name, entities, knowledge_graph_inst, + entity_vdb, global_config, pipeline_status, pipeline_status_lock, @@ -2175,36 +2277,9 @@ async def merge_nodes_and_edges( entity_chunks_storage, ) - # Vector database operation (equally critical, must succeed) - if entity_vdb is not None and entity_data: - data_for_vdb = { - compute_mdhash_id( - str(entity_data["entity_name"]), prefix="ent-" - ): { - "entity_name": entity_data["entity_name"], - "entity_type": entity_data["entity_type"], - "content": f"{entity_data['entity_name']}\n{entity_data['description']}", - "source_id": entity_data["source_id"], - "file_path": entity_data.get( - "file_path", "unknown_source" - ), - } - } - - logger.debug(f"Inserting {entity_name} in Graph") - # Use safe operation wrapper - VDB failure must throw exception - await safe_vdb_operation_with_exception( - operation=lambda: entity_vdb.upsert(data_for_vdb), - operation_name="entity_upsert", - entity_name=entity_name, - max_retries=3, - retry_delay=0.1, - ) - return entity_data except Exception as e: - # Any database operation failure is critical error_msg = ( f"Critical error in entity processing for `{entity_name}`: {e}" ) @@ -2294,12 +2369,14 @@ async def merge_nodes_and_edges( try: added_entities = [] # Track entities added during edge processing - # Graph database operation (critical path, must succeed) + logger.debug(f"Processing relation {sorted_edge_key}") edge_data = await _merge_edges_then_upsert( edge_key[0], edge_key[1], edges, knowledge_graph_inst, + relationships_vdb, + entity_vdb, global_config, pipeline_status, pipeline_status_lock, @@ -2311,66 +2388,9 @@ async def merge_nodes_and_edges( if edge_data is None: return None, [] - # Vector database operation (equally critical, must succeed) - if relationships_vdb is not None: - data_for_vdb = { - compute_mdhash_id( - edge_data["src_id"] + edge_data["tgt_id"], prefix="rel-" - ): { - "src_id": edge_data["src_id"], - "tgt_id": edge_data["tgt_id"], - "keywords": edge_data["keywords"], - 
"content": f"{edge_data['src_id']}\t{edge_data['tgt_id']}\n{edge_data['keywords']}\n{edge_data['description']}", - "source_id": edge_data["source_id"], - "file_path": edge_data.get( - "file_path", "unknown_source" - ), - "weight": edge_data.get("weight", 1.0), - } - } - - # Use safe operation wrapper - VDB failure must throw exception - await safe_vdb_operation_with_exception( - operation=lambda: relationships_vdb.upsert(data_for_vdb), - operation_name="relationship_upsert", - entity_name=f"{edge_data['src_id']}-{edge_data['tgt_id']}", - max_retries=3, - retry_delay=0.1, - ) - - # Update added_entities to entity vector database using safe operation wrapper - if added_entities and entity_vdb is not None: - for entity_data in added_entities: - entity_vdb_id = compute_mdhash_id( - entity_data["entity_name"], prefix="ent-" - ) - entity_content = f"{entity_data['entity_name']}\n{entity_data['description']}" - - vdb_data = { - entity_vdb_id: { - "content": entity_content, - "entity_name": entity_data["entity_name"], - "source_id": entity_data["source_id"], - "entity_type": entity_data["entity_type"], - "file_path": entity_data.get( - "file_path", "unknown_source" - ), - } - } - - # Use safe operation wrapper - VDB failure must throw exception - await safe_vdb_operation_with_exception( - operation=lambda data=vdb_data: entity_vdb.upsert(data), - operation_name="added_entity_upsert", - entity_name=entity_data["entity_name"], - max_retries=3, - retry_delay=0.1, - ) - return edge_data, added_entities except Exception as e: - # Any database operation failure is critical error_msg = f"Critical error in relationship processing for `{sorted_edge_key}`: {e}" logger.error(error_msg) From 1154c5683fd3b9b3c378b0dea3f1791bcf7b09cf Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 04:41:15 +0800 Subject: [PATCH 13/25] Refactor deduplication calculation and remove unused variables --- lightrag/operate.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 290f19b6..325bea25 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1498,8 +1498,6 @@ async def _merge_nodes_then_upsert( reverse=True, )[0][0] # Get the entity type with the highest count - original_nodes_count = len(nodes_data) - new_source_ids = [dp["source_id"] for dp in nodes_data if dp.get("source_id")] existing_full_source_ids = [] @@ -1584,12 +1582,12 @@ async def _merge_nodes_then_upsert( # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions - deduplicated_num = original_nodes_count - len(sorted_descriptions) + num_fragment = len(description_list) + already_fragment = len(already_description) + deduplicated_num = already_fragment + len(nodes_data) - num_fragment if deduplicated_num > 0: dd_message = f"dd:{deduplicated_num}" - num_fragment = len(description_list) - already_fragment = len(already_description) if skip_summary_due_to_limit: description = ( already_node.get("description", "(no description)") @@ -1818,8 +1816,6 @@ async def _merge_edges_then_upsert( ) ) - original_edges_count = len(edges_data) - new_source_ids = [dp["source_id"] for dp in edges_data if dp.get("source_id")] storage_key = make_relation_chunk_key(src_id, tgt_id) @@ -1911,12 +1907,12 @@ async def _merge_edges_then_upsert( # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions - deduplicated_num = original_edges_count - 
len(sorted_descriptions) - if deduplicated_num > 0: - dd_message = f"dd:{deduplicated_num}" num_fragment = len(description_list) already_fragment = len(already_description) + deduplicated_num = already_fragment + len(edges_data) - num_fragment + if deduplicated_num > 0: + dd_message = f"dd:{deduplicated_num}" if skip_summary_due_to_limit: description = ( @@ -1924,9 +1920,7 @@ async def _merge_edges_then_upsert( if already_edge else "(no description)" ) - status_message = ( - f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" - ) + status_message = f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" logger.debug(status_message) if pipeline_status is not None and pipeline_status_lock is not None: async with pipeline_status_lock: From 019dff5248d1d5111e9fbf8791cbc6423ad7c4a7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 04:46:07 +0800 Subject: [PATCH 14/25] Update truncation message format in properties tooltip --- lightrag_webui/src/components/graph/PropertiesView.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag_webui/src/components/graph/PropertiesView.tsx b/lightrag_webui/src/components/graph/PropertiesView.tsx index b46eb5b4..39b9a448 100644 --- a/lightrag_webui/src/components/graph/PropertiesView.tsx +++ b/lightrag_webui/src/components/graph/PropertiesView.tsx @@ -222,7 +222,7 @@ const PropertyRow = ({ // If this is source_id field and truncate info exists, append it to the tooltip if (name === 'source_id' && truncate) { - formattedTooltip += `\n(${truncate} truncated)` + formattedTooltip += `\n(Truncation-${truncate})` } // Use EditablePropertyRow for editable fields (description, entity_id and keywords) From cd1c48beaf33250a1abaf3fe3cd793822f413aea Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 05:03:57 +0800 Subject: [PATCH 15/25] Standardize placeholder format to use colon separator consistently --- lightrag/operate.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 325bea25..15f18dca 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1188,7 +1188,7 @@ async def _rebuild_single_entity( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1347,7 +1347,7 @@ async def _rebuild_single_relationship( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." 
) logger.info( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1680,10 +1680,9 @@ async def _merge_nodes_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders has_placeholder = True continue - # Skip placeholders (format: "...{placeholder}(showing X of Y)...") if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) @@ -1715,7 +1714,7 @@ async def _merge_nodes_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." ) logger.info( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -2023,10 +2022,9 @@ async def _merge_edges_then_upsert( # Collect from already_file_paths, excluding placeholder for fp in already_file_paths: # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders has_placeholder = True continue - # Skip placeholders (format: "...{placeholder}(showing X of Y)...") if fp and fp not in seen_paths: file_paths_list.append(fp) seen_paths.add(fp) @@ -2058,7 +2056,7 @@ async def _merge_edges_then_upsert( file_paths_list = file_paths_list[:max_file_paths] file_paths_list.append( - f"...{file_path_placeholder}({limit_method} {max_file_paths}/{original_count})..." + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." ) logger.info( f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" From 1248b3ab0436c0d9aebc96cde611890440b89153 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 05:30:09 +0800 Subject: [PATCH 16/25] Increase default limits for source IDs and file paths in metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Entity source IDs: 3 → 300 • Relation source IDs: 3 → 300 • File paths: 2 → 30 --- lightrag/constants.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/constants.py b/lightrag/constants.py index ad12cccf..7c2b2701 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -58,8 +58,8 @@ DEFAULT_MIN_RERANK_SCORE = 0.0 DEFAULT_RERANK_BINDING = "null" # Default source ids limit in meta data for entity and relation -DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3 -DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3 +DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300 +DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300 SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP" # Keep oldest SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO" # First In First Out (Keep newest) DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP @@ -68,7 +68,7 @@ VALID_SOURCE_IDS_LIMIT_METHODS = { SOURCE_IDS_LIMIT_METHOD_FIFO, } # Default file_path limit in meta data for entity and relation (Use same limit method as source_ids) -DEFAULT_MAX_FILE_PATHS = 2 +DEFAULT_MAX_FILE_PATHS = 30 # Field length of file_path in Milvus Schema for entity and relation (Should not be changed) # file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata. 
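
[Editor's illustrative sketch — not part of any patch in this series.] The patches above keep tightening how chunk ids and file paths attached to an entity or relation are capped: KEEP discards the newest entries and preserves the oldest ones, FIFO discards the oldest and preserves the newest, and a placeholder entry records that truncation already happened so later merges can show a "+" on the count. The standalone Python sketch below only approximates that behaviour under those assumptions; `limit_ids_sketch` and the `and_more` placeholder are hypothetical names and do not reproduce the signature of LightRAG's apply_source_ids_limit.

    # Sketch only: mimics the KEEP/FIFO capping described by the patches above.
    SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"   # keep the oldest ids (head of the list)
    SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"   # keep the newest ids (tail of the list)

    def limit_ids_sketch(
        ids: list[str],
        max_items: int,
        method: str = SOURCE_IDS_LIMIT_METHOD_FIFO,
        placeholder: str | None = None,
    ) -> list[str]:
        """Return at most `max_items` ids, truncating from the head or the tail.

        KEEP drops the newest entries (tail), FIFO drops the oldest entries
        (head). When `placeholder` is given, a marker such as
        "...and_more...(FIFO)" is appended so a later merge can detect that
        the list was already truncated (analogous to the has_placeholder /
        "+" indicator added in the patches above).
        """
        if max_items <= 0 or len(ids) <= max_items:
            return list(ids)

        if method == SOURCE_IDS_LIMIT_METHOD_FIFO:
            kept = ids[-max_items:]   # keep newest
        else:
            kept = ids[:max_items]    # keep oldest

        if placeholder:
            kept = kept + [f"...{placeholder}...({method})"]
        return kept

    if __name__ == "__main__":
        chunk_ids = [f"chunk-{i}" for i in range(1, 8)]
        print(limit_ids_sketch(chunk_ids, 3, "KEEP"))
        # ['chunk-1', 'chunk-2', 'chunk-3']
        print(limit_ids_sketch(chunk_ids, 3, "FIFO", placeholder="and_more"))
        # ['chunk-5', 'chunk-6', 'chunk-7', '...and_more...(FIFO)']

Under these assumptions the trade-off is the one the later patches act on: KEEP avoids re-summarising once the cap is reached (fewer merges, faster), while FIFO keeps the metadata biased toward the most recently inserted chunks.
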
From a5253244f97d5fc8930ad0cd12270ab557d1f4f3 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 06:33:34 +0800 Subject: [PATCH 17/25] Simplify skip logging and reduce pipeline status updates --- lightrag/operate.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 15f18dca..4b34f474 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1589,17 +1589,12 @@ async def _merge_nodes_then_upsert( dd_message = f"dd:{deduplicated_num}" if skip_summary_due_to_limit: + logger.info(f"Skipped `{entity_name}`: KEEP old chunks") description = ( already_node.get("description", "(no description)") if already_node else "(no description)" ) - status_message = f"Skip merge for `{entity_name}`: KEEP limit reached" - logger.debug(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) existing_node_data = dict(already_node or {}) if not existing_node_data: existing_node_data = { @@ -1914,17 +1909,12 @@ async def _merge_edges_then_upsert( dd_message = f"dd:{deduplicated_num}" if skip_summary_due_to_limit: + logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks") description = ( already_edge.get("description", "(no description)") if already_edge else "(no description)" ) - status_message = f"Skip merge for `{src_id}`~`{tgt_id}`: KEEP limit reached" - logger.debug(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) existing_edge_data = dict(already_edge or {}) if not existing_edge_data: existing_edge_data = { From be3d274a0b852962d56abd5c1d33bf86c8288700 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 15:16:47 +0800 Subject: [PATCH 18/25] Refactor node and edge merging logic with improved code structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add numbered steps for clarity • Improve early return handling • Enhance file path limiting logic --- lightrag/operate.py | 553 +++++++++++++++++++++----------------------- 1 file changed, 262 insertions(+), 291 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 4b34f474..60fa66a3 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -57,6 +57,7 @@ from lightrag.constants import ( SOURCE_IDS_LIMIT_METHOD_KEEP, SOURCE_IDS_LIMIT_METHOD_FIFO, DEFAULT_FILE_PATH_MORE_PLACEHOLDER, + DEFAULT_MAX_FILE_PATHS, ) from lightrag.kg.shared_storage import get_storage_keyed_lock import time @@ -1483,6 +1484,7 @@ async def _merge_nodes_then_upsert( already_description = [] already_file_paths = [] + # 1. 
Get existing node data from knowledge graph already_node = await knowledge_graph_inst.get_node(entity_name) if already_node: already_entity_types.append(already_node["entity_type"]) @@ -1490,14 +1492,6 @@ async def _merge_nodes_then_upsert( already_file_paths.extend(already_node["file_path"].split(GRAPH_FIELD_SEP)) already_description.extend(already_node["description"].split(GRAPH_FIELD_SEP)) - entity_type = sorted( - Counter( - [dp["entity_type"] for dp in nodes_data] + already_entity_types - ).items(), - key=lambda x: x[1], - reverse=True, - )[0][0] # Get the entity type with the highest count - new_source_ids = [dp["source_id"] for dp in nodes_data if dp.get("source_id")] existing_full_source_ids = [] @@ -1513,6 +1507,7 @@ async def _merge_nodes_then_upsert( chunk_id for chunk_id in already_source_ids if chunk_id ] + # 2. Merging new source ids with existing ones full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) if entity_chunks_storage is not None and full_source_ids: @@ -1525,6 +1520,7 @@ async def _merge_nodes_then_upsert( } ) + # 3. Finalize source_id by applying source ids limit limit_method = global_config.get("source_ids_limit_method") max_source_limit = global_config.get("max_source_ids_per_entity") source_ids = apply_source_ids_limit( @@ -1534,7 +1530,7 @@ async def _merge_nodes_then_upsert( identifier=f"`{entity_name}`", ) - # Only apply filtering in KEEP(ignore new) mode + # 4. Only keep nodes not filter by apply_source_ids_limit if limit_method is KEEP if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_nodes = [] @@ -1549,18 +1545,38 @@ async def _merge_nodes_then_upsert( continue filtered_nodes.append(dp) nodes_data = filtered_nodes - else: - # In FIFO mode, keep all node descriptions - truncation happens at source_ids level only + else: # In FIFO mode, keep all nodes - truncation happens at source_ids level only nodes_data = list(nodes_data) - skip_summary_due_to_limit = ( + # 5. Check if we need to skip summary due to source_ids limit + if ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit and not nodes_data - and already_description - ) + ): + if already_node: + logger.info(f"Skipped `{entity_name}`: KEEP old chunks") + existing_node_data = dict(already_node) + return existing_node_data + else: + logger.error(f"Internal Error: already_node missing for `{entity_name}`") + raise ValueError( + f"Internal Error: already_node missing for `{entity_name}`" + ) - # Deduplicate by description, keeping first occurrence + # 6.1 Finalize source_id + source_id = GRAPH_FIELD_SEP.join(source_ids) + + # 6.2 Finalize entity type by highest count + entity_type = sorted( + Counter( + [dp["entity_type"] for dp in nodes_data] + already_entity_types + ).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + # 7. 
Deduplicate nodes by description, keeping first occurrence in the same document unique_nodes = {} for dp in nodes_data: desc = dp.get("description") @@ -1569,154 +1585,121 @@ async def _merge_nodes_then_upsert( if desc not in unique_nodes: unique_nodes[desc] = dp - # Sort description by timestamp, then by description length (largest to smallest) when timestamps are the same + # Sort description by timestamp, then by description length when timestamps are the same sorted_nodes = sorted( unique_nodes.values(), key=lambda x: (x.get("timestamp", 0), -len(x.get("description", ""))), ) sorted_descriptions = [dp["description"] for dp in sorted_nodes] - truncation_info = "" - dd_message = "" - has_placeholder = False # Initialize to track placeholder in file paths - # Combine already_description with sorted new sorted descriptions description_list = already_description + sorted_descriptions - num_fragment = len(description_list) - already_fragment = len(already_description) - deduplicated_num = already_fragment + len(nodes_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"dd:{deduplicated_num}" - - if skip_summary_due_to_limit: - logger.info(f"Skipped `{entity_name}`: KEEP old chunks") - description = ( - already_node.get("description", "(no description)") - if already_node - else "(no description)" - ) - existing_node_data = dict(already_node or {}) - if not existing_node_data: - existing_node_data = { - "entity_id": entity_name, - "entity_type": entity_type, - "description": description, - "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), - "file_path": GRAPH_FIELD_SEP.join(already_file_paths), - "created_at": int(time.time()), - "truncate": "", - } - existing_node_data["entity_name"] = entity_name - return existing_node_data - elif num_fragment > 0: - # Get summary and LLM usage status - description, llm_was_used = await _handle_entity_relation_summary( - "Entity", - entity_name, - description_list, - GRAPH_FIELD_SEP, - global_config, - llm_response_cache, - ) - - # Log based on actual LLM usage - if llm_was_used: - status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" - else: - status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" - - # Add truncation info from apply_source_ids_limit if truncation occurred - if len(source_ids) < len(full_source_ids): - # Add + sign if has_placeholder is True, indicating actual file count is higher - full_source_count_str = ( - f"{len(full_source_ids)}+" - if has_placeholder - else str(len(full_source_ids)) - ) - truncation_info = ( - f"{limit_method}:{len(source_ids)}/{full_source_count_str}" - ) - - if dd_message or truncation_info: - status_message += ( - f" ({', '.join(filter(None, [truncation_info, dd_message]))})" - ) - - if already_fragment > 0 or llm_was_used: - logger.info(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) - else: - logger.debug(status_message) - - else: + if not description_list: logger.error(f"Entity {entity_name} has no description") - description = "(no description)" + raise ValueError(f"Entity {entity_name} has no description") - source_id = GRAPH_FIELD_SEP.join(source_ids) + # 8. 
Get summary description an LLM usage status + description, llm_was_used = await _handle_entity_relation_summary( + "Entity", + entity_name, + description_list, + GRAPH_FIELD_SEP, + global_config, + llm_response_cache, + ) - # Build file_path with count limit - if skip_summary_due_to_limit: - # Skip limit, keep original file_path - file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) - else: - # Collect and apply limit - file_paths_list = [] - seen_paths = set() - has_placeholder = False # Track if already_file_paths contains placeholder + # 9. Build file_path within MAX_FILE_PATHS + file_paths_list = [] + seen_paths = set() + has_placeholder = False # Indicating file_path has been truncated before - # Get placeholder to filter it out + max_file_paths = global_config.get("max_file_paths", DEFAULT_MAX_FILE_PATHS) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders + has_placeholder = True + continue + if fp and fp not in seen_paths: + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in nodes_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP + ) file_path_placeholder = global_config.get( "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER ) + # Add + sign to indicate actual file count is higher + original_count_str = ( + f"{len(file_paths_list)}+" if has_placeholder else str(len(file_paths_list)) + ) - # Collect from already_file_paths, excluding placeholder - for fp in already_file_paths: - # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders - has_placeholder = True - continue - if fp and fp not in seen_paths: - file_paths_list.append(fp) - seen_paths.add(fp) + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] - # Collect from new data - for dp in nodes_data: - file_path_item = dp.get("file_path") - if file_path_item and file_path_item not in seen_paths: - file_paths_list.append(file_path_item) - seen_paths.add(file_path_item) + file_paths_list.append(f"...{file_path_placeholder}({limit_method})...") + logger.info( + f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})" + ) + # Finalize file_path + file_path = GRAPH_FIELD_SEP.join(file_paths_list) - # Apply count limit - max_file_paths = global_config.get("max_file_paths") + # 10.Log based on actual LLM usage + num_fragment = len(description_list) + already_fragment = len(already_description) + if llm_was_used: + status_message = f"LLMmrg: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" + else: + status_message = f"Merged: `{entity_name}` | {already_fragment}+{num_fragment - already_fragment}" - if len(file_paths_list) > max_file_paths: - limit_method = global_config.get( - "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP - ) 
- file_path_placeholder = global_config.get( - "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER - ) - original_count = len(file_paths_list) + truncation_info = truncation_info_log = "" + if len(source_ids) < len(full_source_ids): + # Add truncation info from apply_source_ids_limit if truncation occurred + truncation_info_log = f"{limit_method} {len(source_ids)}/{len(full_source_ids)}" + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + truncation_info = truncation_info_log + else: + truncation_info = "Keep Old Chunks" - if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: - # FIFO: keep tail (newest), discard head - file_paths_list = file_paths_list[-max_file_paths:] - else: - # KEEP: keep head (earliest), discard tail - file_paths_list = file_paths_list[:max_file_paths] + deduplicated_num = already_fragment + len(nodes_data) - num_fragment + dd_message = "" + if deduplicated_num > 0: + # Duplicated description detected across multiple trucks for the same entity + dd_message = f"dd {deduplicated_num}" - file_paths_list.append( - f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." - ) - logger.info( - f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" - ) + if dd_message or truncation_info_log: + status_message += ( + f" ({', '.join(filter(None, [truncation_info_log, dd_message]))})" + ) - file_path = GRAPH_FIELD_SEP.join(file_paths_list) + # Add message to pipeline satus when merge happens + if already_fragment > 0 or llm_was_used: + logger.info(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + else: + logger.debug(status_message) + # 11. Update both graph and vector db node_data = dict( entity_id=entity_name, entity_type=entity_type, @@ -1777,6 +1760,7 @@ async def _merge_edges_then_upsert( already_keywords = [] already_file_paths = [] + # 1. Get existing edge data from graph storage if await knowledge_graph_inst.has_edge(src_id, tgt_id): already_edge = await knowledge_graph_inst.get_edge(src_id, tgt_id) # Handle the case where get_edge returns None or missing fields @@ -1826,6 +1810,7 @@ async def _merge_edges_then_upsert( chunk_id for chunk_id in already_source_ids if chunk_id ] + # 2. Merge new source ids with existing ones full_source_ids = merge_source_ids(existing_full_source_ids, new_source_ids) if relation_chunks_storage is not None and full_source_ids: @@ -1838,6 +1823,7 @@ async def _merge_edges_then_upsert( } ) + # 3. Finalize source_id by applying source ids limit limit_method = global_config.get("source_ids_limit_method") max_source_limit = global_config.get("max_source_ids_per_relation") source_ids = apply_source_ids_limit( @@ -1850,7 +1836,7 @@ async def _merge_edges_then_upsert( global_config.get("source_ids_limit_method") or SOURCE_IDS_LIMIT_METHOD_KEEP ) - # Only apply filtering in KEEP(ignore new) mode + # 4. 
Only keep edges with source_id in the final source_ids list if in KEEP mode if limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP: allowed_source_ids = set(source_ids) filtered_edges = [] @@ -1865,21 +1851,49 @@ async def _merge_edges_then_upsert( continue filtered_edges.append(dp) edges_data = filtered_edges - else: - # In FIFO mode, keep all edge descriptions - truncation happens at source_ids level only + else: # In FIFO mode, keep all edges - truncation happens at source_ids level only edges_data = list(edges_data) - skip_summary_due_to_limit = ( + # 5. Check if we need to skip summary due to source_ids limit + if ( limit_method == SOURCE_IDS_LIMIT_METHOD_KEEP and len(existing_full_source_ids) >= max_source_limit and not edges_data - and already_description - ) + ): + if already_edge: + logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks") + existing_edge_data = dict(already_edge) + return existing_edge_data + else: + logger.error( + f"Internal Error: already_node missing for `{src_id}`~`{tgt_id}`" + ) + raise ValueError( + f"Internal Error: already_node missing for `{src_id}`~`{tgt_id}`" + ) - # Process edges_data with None checks + # 6.1 Finalize source_id + source_id = GRAPH_FIELD_SEP.join(source_ids) + + # 6.2 Finalize weight by summing new edges and existing weights weight = sum([dp["weight"] for dp in edges_data] + already_weights) - # Deduplicate by description, keeping first occurrence + # 6.2 Finalize keywords by merging existing and new keywords + all_keywords = set() + # Process already_keywords (which are comma-separated) + for keyword_str in already_keywords: + if keyword_str: # Skip empty strings + all_keywords.update(k.strip() for k in keyword_str.split(",") if k.strip()) + # Process new keywords from edges_data + for edge in edges_data: + if edge.get("keywords"): + all_keywords.update( + k.strip() for k in edge["keywords"].split(",") if k.strip() + ) + # Join all unique keywords with commas + keywords = ",".join(sorted(all_keywords)) + + # 7. Deduplicate by description, keeping first occurrence in the same document unique_edges = {} for dp in edges_data: description_value = dp.get("description") @@ -1895,165 +1909,122 @@ async def _merge_edges_then_upsert( ) sorted_descriptions = [dp["description"] for dp in sorted_edges] - truncation_info = "" - dd_message = "" - has_placeholder = False # Initialize to track placeholder in file paths - # Combine already_description with sorted new descriptions description_list = already_description + sorted_descriptions + if not description_list: + logger.error(f"Relation {src_id}~{tgt_id} has no description") + raise ValueError(f"Relation {src_id}~{tgt_id} has no description") - num_fragment = len(description_list) - already_fragment = len(already_description) - deduplicated_num = already_fragment + len(edges_data) - num_fragment - if deduplicated_num > 0: - dd_message = f"dd:{deduplicated_num}" + # 8. Get summary description an LLM usage status + description, llm_was_used = await _handle_entity_relation_summary( + "Relation", + f"({src_id}, {tgt_id})", + description_list, + GRAPH_FIELD_SEP, + global_config, + llm_response_cache, + ) - if skip_summary_due_to_limit: - logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks") - description = ( - already_edge.get("description", "(no description)") - if already_edge - else "(no description)" + # 9. 
Build file_path within MAX_FILE_PATHS limit + file_paths_list = [] + seen_paths = set() + has_placeholder = False # Track if already_file_paths contains placeholder + + max_file_paths = global_config.get("max_file_paths", DEFAULT_MAX_FILE_PATHS) + file_path_placeholder = global_config.get( + "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER + ) + + # Collect from already_file_paths, excluding placeholder + for fp in already_file_paths: + # Check if this is a placeholder record + if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders + has_placeholder = True + continue + if fp and fp not in seen_paths: + file_paths_list.append(fp) + seen_paths.add(fp) + + # Collect from new data + for dp in edges_data: + file_path_item = dp.get("file_path") + if file_path_item and file_path_item not in seen_paths: + file_paths_list.append(file_path_item) + seen_paths.add(file_path_item) + + # Apply count limit + max_file_paths = global_config.get("max_file_paths") + + if len(file_paths_list) > max_file_paths: + limit_method = global_config.get( + "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP ) - existing_edge_data = dict(already_edge or {}) - if not existing_edge_data: - existing_edge_data = { - "description": description, - "keywords": GRAPH_FIELD_SEP.join(already_keywords), - "source_id": GRAPH_FIELD_SEP.join(existing_full_source_ids), - "file_path": GRAPH_FIELD_SEP.join(already_file_paths), - "weight": sum(already_weights) if already_weights else 0.0, - "truncate": "", - "created_at": int(time.time()), - } - existing_edge_data.setdefault("created_at", int(time.time())) - existing_edge_data["src_id"] = src_id - existing_edge_data["tgt_id"] = tgt_id - return existing_edge_data - elif num_fragment > 0: - # Get summary and LLM usage status - description, llm_was_used = await _handle_entity_relation_summary( - "Relation", - f"({src_id}, {tgt_id})", - description_list, - GRAPH_FIELD_SEP, - global_config, - llm_response_cache, - ) - - # Log based on actual LLM usage - if llm_was_used: - status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" - else: - status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" - - # Add truncation info from apply_source_ids_limit if truncation occurred - if len(source_ids) < len(full_source_ids): - # Add + sign if has_placeholder is True, indicating actual file count is higher - full_source_count_str = ( - f"{len(full_source_ids)}+" - if has_placeholder - else str(len(full_source_ids)) - ) - truncation_info = ( - f"{limit_method}:{len(source_ids)}/{full_source_count_str}" - ) - - if dd_message or truncation_info: - status_message += ( - f" ({', '.join(filter(None, [truncation_info, dd_message]))})" - ) - - if already_fragment > 0 or llm_was_used: - logger.info(status_message) - if pipeline_status is not None and pipeline_status_lock is not None: - async with pipeline_status_lock: - pipeline_status["latest_message"] = status_message - pipeline_status["history_messages"].append(status_message) - else: - logger.debug(status_message) - - else: - logger.error(f"Edge {src_id} - {tgt_id} has no description") - description = "(no description)" - - # Split all existing and new keywords into individual terms, then combine and deduplicate - all_keywords = set() - # Process already_keywords (which are comma-separated) - for keyword_str in already_keywords: - if keyword_str: # Skip empty strings - all_keywords.update(k.strip() for k in 
keyword_str.split(",") if k.strip()) - # Process new keywords from edges_data - for edge in edges_data: - if edge.get("keywords"): - all_keywords.update( - k.strip() for k in edge["keywords"].split(",") if k.strip() - ) - # Join all unique keywords with commas - keywords = ",".join(sorted(all_keywords)) - - source_id = GRAPH_FIELD_SEP.join(source_ids) - - # Build file_path with count limit - if skip_summary_due_to_limit: - # Skip limit, keep original file_path - file_path = GRAPH_FIELD_SEP.join(fp for fp in already_file_paths if fp) - else: - # Collect and apply limit - file_paths_list = [] - seen_paths = set() - has_placeholder = False # Track if already_file_paths contains placeholder - - # Get placeholder to filter it out file_path_placeholder = global_config.get( "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER ) - # Collect from already_file_paths, excluding placeholder - for fp in already_file_paths: - # Check if this is a placeholder record - if fp and fp.startswith(f"...{file_path_placeholder}"): # Skip placeholders - has_placeholder = True - continue - if fp and fp not in seen_paths: - file_paths_list.append(fp) - seen_paths.add(fp) + # Add + sign to indicate actual file count is higher + original_count_str = ( + f"{len(file_paths_list)}+" if has_placeholder else str(len(file_paths_list)) + ) - # Collect from new data - for dp in edges_data: - file_path_item = dp.get("file_path") - if file_path_item and file_path_item not in seen_paths: - file_paths_list.append(file_path_item) - seen_paths.add(file_path_item) + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + # FIFO: keep tail (newest), discard head + file_paths_list = file_paths_list[-max_file_paths:] + else: + # KEEP: keep head (earliest), discard tail + file_paths_list = file_paths_list[:max_file_paths] - # Apply count limit - max_file_paths = global_config.get("max_file_paths") + # Add + sign if has_placeholder is True, indicating actual file count is higher - if len(file_paths_list) > max_file_paths: - limit_method = global_config.get( - "source_ids_limit_method", SOURCE_IDS_LIMIT_METHOD_KEEP - ) - file_path_placeholder = global_config.get( - "file_path_more_placeholder", DEFAULT_FILE_PATH_MORE_PLACEHOLDER - ) - original_count = len(file_paths_list) + file_paths_list.append( + f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..." + ) + logger.info( + f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})" + ) + # Finalize file_path + file_path = GRAPH_FIELD_SEP.join(file_paths_list) - if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: - # FIFO: keep tail (newest), discard head - file_paths_list = file_paths_list[-max_file_paths:] - else: - # KEEP: keep head (earliest), discard tail - file_paths_list = file_paths_list[:max_file_paths] + # 10. Log based on actual LLM usage + num_fragment = len(description_list) + already_fragment = len(already_description) + if llm_was_used: + status_message = f"LLMmrg: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" + else: + status_message = f"Merged: `{src_id}`~`{tgt_id}` | {already_fragment}+{num_fragment - already_fragment}" - file_paths_list.append( - f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count})..." 
- ) - logger.info( - f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count} -> {max_file_paths} ({limit_method})" - ) + truncation_info = truncation_info_log = "" + if len(source_ids) < len(full_source_ids): + # Add truncation info from apply_source_ids_limit if truncation occurred + truncation_info_log = f"{limit_method} {len(source_ids)}/{len(full_source_ids)}" + if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: + truncation_info = truncation_info_log + else: + truncation_info = "Keep Old Chunks" - file_path = GRAPH_FIELD_SEP.join(file_paths_list) + deduplicated_num = already_fragment + len(edges_data) - num_fragment + dd_message = "" + if deduplicated_num > 0: + # Duplicated description detected across multiple trucks for the same entity + dd_message = f"dd {deduplicated_num}" + if dd_message or truncation_info_log: + status_message += ( + f" ({', '.join(filter(None, [truncation_info_log, dd_message]))})" + ) + + # Add message to pipeline satus when merge happens + if already_fragment > 0 or llm_was_used: + logger.info(status_message) + if pipeline_status is not None and pipeline_status_lock is not None: + async with pipeline_status_lock: + pipeline_status["latest_message"] = status_message + pipeline_status["history_messages"].append(status_message) + else: + logger.debug(status_message) + + # 11. Update both graph and vector db for need_insert_id in [src_id, tgt_id]: if not (await knowledge_graph_inst.has_node(need_insert_id)): node_created_at = int(time.time()) From 80668aae229d07f6404a789925ae8d2897359325 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 15:39:31 +0800 Subject: [PATCH 19/25] Improve file path truncation labels and UI consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Standardize FIFO/KEEP truncation labels • Update UI truncation text format --- lightrag/operate.py | 14 ++++++-------- .../src/components/graph/PropertiesView.tsx | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 60fa66a3..c3816cec 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1650,11 +1650,12 @@ async def _merge_nodes_then_upsert( if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: # FIFO: keep tail (newest), discard head file_paths_list = file_paths_list[-max_file_paths:] + file_paths_list.append(f"...{file_path_placeholder}...(FIFO)") else: # KEEP: keep head (earliest), discard tail file_paths_list = file_paths_list[:max_file_paths] + file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)") - file_paths_list.append(f"...{file_path_placeholder}({limit_method})...") logger.info( f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})" ) @@ -1676,7 +1677,7 @@ async def _merge_nodes_then_upsert( if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: truncation_info = truncation_info_log else: - truncation_info = "Keep Old Chunks" + truncation_info = "KEEP Old" deduplicated_num = already_fragment + len(nodes_data) - num_fragment dd_message = "" @@ -1971,15 +1972,12 @@ async def _merge_edges_then_upsert( if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO: # FIFO: keep tail (newest), discard head file_paths_list = file_paths_list[-max_file_paths:] + file_paths_list.append(f"...{file_path_placeholder}...(FIFO)") else: # KEEP: keep head (earliest), discard tail file_paths_list = file_paths_list[:max_file_paths] + file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)") - # Add + sign if has_placeholder is True, 
From 80668aae229d07f6404a789925ae8d2897359325 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 15:39:31 +0800
Subject: [PATCH 19/25] Improve file path truncation labels and UI consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Standardize FIFO/KEEP truncation labels
• Update UI truncation text format
---
 lightrag/operate.py                         | 14 ++++++--------
 .../src/components/graph/PropertiesView.tsx |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 60fa66a3..c3816cec 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1650,11 +1650,12 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
+            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
+            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
 
-        file_paths_list.append(f"...{file_path_placeholder}({limit_method})...")
         logger.info(
             f"Limited `{entity_name}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
@@ -1676,7 +1677,7 @@ async def _merge_nodes_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "Keep Old Chunks"
+            truncation_info = "KEEP Old"
 
     deduplicated_num = already_fragment + len(nodes_data) - num_fragment
     dd_message = ""
@@ -1971,15 +1972,12 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             # FIFO: keep tail (newest), discard head
             file_paths_list = file_paths_list[-max_file_paths:]
+            file_paths_list.append(f"...{file_path_placeholder}...(FIFO)")
         else:
             # KEEP: keep head (earliest), discard tail
             file_paths_list = file_paths_list[:max_file_paths]
+            file_paths_list.append(f"...{file_path_placeholder}...(KEEP Old)")
 
-        # Add + sign if has_placeholder is True, indicating actual file count is higher
-
-        file_paths_list.append(
-            f"...{file_path_placeholder}({limit_method}:{max_file_paths}/{original_count_str})..."
-        )
         logger.info(
             f"Limited `{src_id}`~`{tgt_id}`: file_path {original_count_str} -> {max_file_paths} ({limit_method})"
         )
@@ -2001,7 +1999,7 @@ async def _merge_edges_then_upsert(
         if limit_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
             truncation_info = truncation_info_log
         else:
-            truncation_info = "Keep Old Chunks"
+            truncation_info = "KEEP Old"
 
     deduplicated_num = already_fragment + len(edges_data) - num_fragment
     dd_message = ""
diff --git a/lightrag_webui/src/components/graph/PropertiesView.tsx b/lightrag_webui/src/components/graph/PropertiesView.tsx
index 39b9a448..97411f29 100644
--- a/lightrag_webui/src/components/graph/PropertiesView.tsx
+++ b/lightrag_webui/src/components/graph/PropertiesView.tsx
@@ -222,7 +222,7 @@ const PropertyRow = ({
 
   // If this is source_id field and truncate info exists, append it to the tooltip
   if (name === 'source_id' && truncate) {
-    formattedTooltip += `\n(Truncation-${truncate})`
+    formattedTooltip += `\n(Truncated: ${truncate})`
   }
 
   // Use EditablePropertyRow for editable fields (description, entity_id and keywords)

From 3ad616be4f2b4e4f35f2904b6ae3264a322a0800 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 16:12:11 +0800
Subject: [PATCH 20/25] Change default source IDs limit method from KEEP to FIFO

---
 env.example           | 6 ++++--
 lightrag/constants.py | 9 ++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/env.example b/env.example
index 73f2d7b7..4b86d79f 100644
--- a/env.example
+++ b/env.example
@@ -138,8 +138,10 @@ SUMMARY_LANGUAGE=English
 ### control the maximum chunk_ids stored in vector and graph db
 # MAX_SOURCE_IDS_PER_ENTITY=300
 # MAX_SOURCE_IDS_PER_RELATION=300
-### control chunk_ids limitation method: KEEP, FIFO (KEEP: Keep oldest, FIFO: First in first out)
-# SOURCE_IDS_LIMIT_METHOD=KEEP
+### control chunk_ids limitation method: KEEP, FIFO
+### FIFO: First in first out
+### KEEP: Keep oldest (fewer merge actions and faster)
+# SOURCE_IDS_LIMIT_METHOD=FIFO
 
 ### Maximum number of file paths stored in entity/relation file_path field
 # MAX_FILE_PATHS=30
diff --git a/lightrag/constants.py b/lightrag/constants.py
index 7c2b2701..f4e06e11 100644
--- a/lightrag/constants.py
+++ b/lightrag/constants.py
@@ -60,9 +60,12 @@ DEFAULT_RERANK_BINDING = "null"
 # Default source ids limit in meta data for entity and relation
 DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
 DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
-SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"  # Keep oldest
-SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"  # First In First Out (Keep newest)
-DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_KEEP
+### control chunk_ids limitation method: KEEP, FIFO
+### FIFO: First in first out
+### KEEP: Keep oldest (fewer merge actions and faster)
+SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
+SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
+DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
 VALID_SOURCE_IDS_LIMIT_METHODS = {
     SOURCE_IDS_LIMIT_METHOD_KEEP,
     SOURCE_IDS_LIMIT_METHOD_FIFO,
 }
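
The two patches above standardize the truncation labels and flip the default limit method to FIFO while keeping KEEP as an opt-in. As a hedged sketch of how such a setting is typically resolved from the environment (the helper name here is an assumption, not LightRAG's actual config loader):

import os

SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {SOURCE_IDS_LIMIT_METHOD_KEEP, SOURCE_IDS_LIMIT_METHOD_FIFO}

def resolve_limit_method() -> str:
    # Fall back to the FIFO default when the env value is missing or invalid.
    method = os.getenv("SOURCE_IDS_LIMIT_METHOD", DEFAULT_SOURCE_IDS_LIMIT_METHOD).upper()
    return method if method in VALID_SOURCE_IDS_LIMIT_METHODS else DEFAULT_SOURCE_IDS_LIMIT_METHOD
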
From 3ed2abd82c1b0335c652d1f46edebcdb04b2bc1d Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 16:20:34 +0800
Subject: [PATCH 21/25] Improve logging to show source ID ratios when skipping entities/edges

---
 lightrag/operate.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index c3816cec..c5f370f3 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1555,7 +1555,9 @@ async def _merge_nodes_then_upsert(
         and not nodes_data
     ):
         if already_node:
-            logger.info(f"Skipped `{entity_name}`: KEEP old chunks")
+            logger.info(
+                f"Skipped `{entity_name}`: KEEP old chunks {len(already_source_ids)}/{len(full_source_ids)}"
+            )
             existing_node_data = dict(already_node)
             return existing_node_data
         else:
@@ -1862,7 +1864,9 @@ async def _merge_edges_then_upsert(
         and not edges_data
     ):
         if already_edge:
-            logger.info(f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks")
+            logger.info(
+                f"Skipped `{src_id}`~`{tgt_id}`: KEEP old chunks {len(already_source_ids)}/{len(full_source_ids)}"
+            )
             existing_edge_data = dict(already_edge)
             return existing_edge_data
         else:

From e5e16b7bd171848c5b973a24eda53ae636ffea31 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 16:27:04 +0800
Subject: [PATCH 22/25] Fix Redis data migration error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Use proper Redis connection context
• Fix namespace pattern for key scanning
• Propagate storage check exceptions
• Remove defensive error swallowing
---
 lightrag/kg/redis_impl.py | 11 ++++++-----
 lightrag/lightrag.py      |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py
index 8a393497..2e9a7d43 100644
--- a/lightrag/kg/redis_impl.py
+++ b/lightrag/kg/redis_impl.py
@@ -368,12 +368,13 @@ class RedisKVStorage(BaseKVStorage):
         Returns:
             bool: True if storage is empty, False otherwise
         """
-        pattern = f"{self.namespace}:{self.workspace}:*"
+        pattern = f"{self.final_namespace}:*"
         try:
-            # Use scan to check if any keys exist
-            async for key in self.redis.scan_iter(match=pattern, count=1):
-                return False  # Found at least one key
-            return True  # No keys found
+            async with self._get_redis_connection() as redis:
+                # Use scan to check if any keys exist
+                async for key in redis.scan_iter(match=pattern, count=1):
+                    return False  # Found at least one key
+                return True  # No keys found
         except Exception as e:
             logger.error(f"[{self.workspace}] Error checking if storage is empty: {e}")
             return True
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 4380a276..afd1de76 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -887,13 +887,13 @@ class LightRAG:
             need_entity_migration = await self.entity_chunks.is_empty()
         except Exception as exc:  # pragma: no cover - defensive logging
             logger.error(f"Failed to check entity chunks storage: {exc}")
-            need_entity_migration = True
+            raise exc
 
         try:
             need_relation_migration = await self.relation_chunks.is_empty()
         except Exception as exc:  # pragma: no cover - defensive logging
             logger.error(f"Failed to check relation chunks storage: {exc}")
-            need_relation_migration = True
+            raise exc
 
         if not need_entity_migration and not need_relation_migration:
             return
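
The Redis fix above scopes the scan to the storage's final namespace and runs it on a managed connection; the emptiness check itself is just "does any key match the prefix". A standalone sketch of that pattern using redis.asyncio, with connection handling simplified:

import redis.asyncio as aioredis

async def namespace_is_empty(client: aioredis.Redis, final_namespace: str) -> bool:
    # True if no key exists under "<final_namespace>:".
    pattern = f"{final_namespace}:*"
    async for _ in client.scan_iter(match=pattern, count=1):
        return False  # found at least one key
    return True
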
From 88a45523e26f799e9e51d149301d7923f14350a5 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 21 Oct 2025 17:33:00 +0800
Subject: [PATCH 23/25] Increase default max file paths from 30 to 100 and improve documentation

- Bump DEFAULT_MAX_FILE_PATHS to 100
- Add clarifying comment about display
---
 env.example           | 5 +++--
 lightrag/constants.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/env.example b/env.example
index 4b86d79f..3fd824ba 100644
--- a/env.example
+++ b/env.example
@@ -142,8 +142,9 @@ SUMMARY_LANGUAGE=English
 ### FIFO: First in first out
 ### KEEP: Keep oldest (fewer merge actions and faster)
 # SOURCE_IDS_LIMIT_METHOD=FIFO
-### Maximum number of file paths stored in entity/relation file_path field
-# MAX_FILE_PATHS=30
+
+# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
+# MAX_FILE_PATHS=100
 
 ### maximum number of related chunks per source entity or relation
 ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)
diff --git a/lightrag/constants.py b/lightrag/constants.py
index f4e06e11..0d02edbf 100644
--- a/lightrag/constants.py
+++ b/lightrag/constants.py
@@ -70,8 +70,8 @@ VALID_SOURCE_IDS_LIMIT_METHODS = {
     SOURCE_IDS_LIMIT_METHOD_KEEP,
     SOURCE_IDS_LIMIT_METHOD_FIFO,
 }
-# Default file_path limit in meta data for entity and relation (Use same limit method as source_ids)
-DEFAULT_MAX_FILE_PATHS = 30
+# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance)
+DEFAULT_MAX_FILE_PATHS = 100
 
 # Field length of file_path in Milvus Schema for entity and relation (Should not be changed)
 # file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata.
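
Since MAX_FILE_PATHS only bounds what is stored for display, the file_path field reduces to joining a capped, order-preserving list. A sketch under the assumption that GRAPH_FIELD_SEP is the usual "<SEP>" separator (in practice, import it from lightrag.constants):

GRAPH_FIELD_SEP = "<SEP>"  # assumption; use lightrag.constants.GRAPH_FIELD_SEP in practice
DEFAULT_MAX_FILE_PATHS = 100

def build_file_path_field(paths, max_paths=DEFAULT_MAX_FILE_PATHS):
    # Drop empties, de-duplicate while preserving order, then cap for display.
    deduped = list(dict.fromkeys(p for p in paths if p))
    return GRAPH_FIELD_SEP.join(deduped[:max_paths])
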
+ f"...{file_path_placeholder}...({limit_method} {max_file_paths}/{original_count})" ) logger.info( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" @@ -1384,7 +1384,7 @@ async def _rebuild_single_relationship( if len(limited_chunk_ids) < len(normalized_chunk_ids): truncation_info = ( - f"{limit_method}:{len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" + f"{limit_method} {len(limited_chunk_ids)}/{len(normalized_chunk_ids)}" ) else: truncation_info = "" From a809245aed58fe68a1c1a75e0d63d7db7f327978 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 21 Oct 2025 18:57:54 +0800 Subject: [PATCH 25/25] Preserve file path order by using lists instead of sets --- lightrag/operate.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index d4a86977..b3adb67d 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1026,7 +1026,7 @@ async def _rebuild_single_entity( async def _update_entity_storage( final_description: str, entity_type: str, - file_paths: set[str], + file_paths: list[str], source_chunk_ids: list[str], truncation_info: str = "", ): @@ -1195,8 +1195,6 @@ async def _rebuild_single_entity( f"Limited `{entity_name}`: file_path {original_count} -> {max_file_paths} ({limit_method})" ) - file_paths = set(file_paths_list) - # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) entity_types = list(dict.fromkeys(entity_types)) @@ -1231,7 +1229,7 @@ async def _rebuild_single_entity( await _update_entity_storage( final_description, entity_type, - file_paths, + file_paths_list, limited_chunk_ids, truncation_info, ) @@ -1354,8 +1352,6 @@ async def _rebuild_single_relationship( f"Limited `{src}`~`{tgt}`: file_path {original_count} -> {max_file_paths} ({limit_method})" ) - file_paths = set(file_paths_list) - # Remove duplicates while preserving order description_list = list(dict.fromkeys(descriptions)) keywords = list(dict.fromkeys(keywords)) @@ -1398,8 +1394,8 @@ async def _rebuild_single_relationship( "keywords": combined_keywords, "weight": weight, "source_id": GRAPH_FIELD_SEP.join(limited_chunk_ids), - "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths if fp]) - if file_paths + "file_path": GRAPH_FIELD_SEP.join([fp for fp in file_paths_list if fp]) + if file_paths_list else current_relationship.get("file_path", "unknown_source"), "truncate": truncation_info, }