This commit is contained in:
Raphaël MANSUY 2025-12-04 19:18:35 +08:00
parent ec0d9bd763
commit a3d7f4b985
5 changed files with 25 additions and 37 deletions

View file

@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true
# MAX_RELATION_TOKENS=8000
### control the maximum tokens send to LLM (include entities, relations and chunks)
# MAX_TOTAL_TOKENS=30000
### control the maximum chunk_ids stored in vector db
# MAX_CHUNK_IDS_PER_ENTITY=500
### control the maximum chunk_ids stored
# MAX_SOURCE_IDS_PER_ENTITY=500
### maximum number of related chunks per source entity or relation
### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph)

View file

@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000
# Default values for extraction settings
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
DEFAULT_MAX_GLEANING = 1
DEFAULT_ENTITY_NAME_MAX_LENGTH = 256
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to both the graph and vector DBs
# Number of description fragments to trigger LLM summary
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
@ -38,7 +38,7 @@ DEFAULT_ENTITY_TYPES = [
"NaturalObject",
]
# Separator for: description, source_id and relation-key fields(Can not be changed after data inserted)
# Separator for graph fields
GRAPH_FIELD_SEP = "<SEP>"
# Query and retrieval configuration defaults
@ -58,27 +58,8 @@ DEFAULT_HISTORY_TURNS = 0
DEFAULT_MIN_RERANK_SCORE = 0.0
DEFAULT_RERANK_BINDING = "null"
# Default source ids limit in meta data for entity and relation
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
### control chunk_ids limitation method: FIFO, KEEP
### FIFO: First in first out
### KEEP: Keep oldest (fewer merge actions and faster)
SOURCE_IDS_LIMIT_METHOD_KEEP = "KEEP"
SOURCE_IDS_LIMIT_METHOD_FIFO = "FIFO"
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
VALID_SOURCE_IDS_LIMIT_METHODS = {
SOURCE_IDS_LIMIT_METHOD_KEEP,
SOURCE_IDS_LIMIT_METHOD_FIFO,
}
# Maximum number of file paths stored in entity/relation file_path field (For displayed only, does not affect query performance)
DEFAULT_MAX_FILE_PATHS = 100
# Field length of file_path in Milvus Schema for entity and relation (Should not be changed)
# file_path must store all file paths up to the DEFAULT_MAX_FILE_PATHS limit within the metadata.
# File path configuration for vector and graph database(Should not be changed, used in Milvus Schema)
DEFAULT_MAX_FILE_PATH_LENGTH = 32768
# Placeholder for more file paths in meta data for entity and relation (Should not be changed)
DEFAULT_FILE_PATH_MORE_PLACEHOLDER = "truncated"
# Default temperature for LLM
DEFAULT_TEMPERATURE = 1.0

View file

@ -39,6 +39,7 @@ from lightrag.constants import (
DEFAULT_MAX_ASYNC,
DEFAULT_MAX_PARALLEL_INSERT,
DEFAULT_MAX_GRAPH_NODES,
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY,
DEFAULT_ENTITY_TYPES,
DEFAULT_SUMMARY_LANGUAGE,
DEFAULT_LLM_TIMEOUT,
@ -359,6 +360,11 @@ class LightRAG:
)
"""Maximum number of graph nodes to return in knowledge graph queries."""
max_source_ids_per_entity: int = field(
default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int)
)
"""Maximum number of source (chunk) ids in entity Grpah + VDB."""
addon_params: dict[str, Any] = field(
default_factory=lambda: {
"language": get_env_value(

View file

@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert(
merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids)
source_ids = truncate_entity_source_id(merged_source_ids, entity_name)
source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config)
source_id = GRAPH_FIELD_SEP.join(source_ids)
file_path = build_file_path(already_file_paths, nodes_data, entity_name)
@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges(
[entity_name], namespace=namespace, enable_logging=False
):
try:
logger.info(f"Inserting {entity_name} in Graph")
logger.debug(f"Inserting {entity_name} in Graph")
# Graph database operation (critical path, must succeed)
entity_data = await _merge_nodes_then_upsert(
entity_name,
@ -1690,7 +1690,7 @@ async def merge_nodes_and_edges(
}
logger.info(f"Inserting {entity_name} in Graph")
logger.debug(f"Inserting {entity_name} in Graph")
# Use safe operation wrapper - VDB failure must throw exception
await safe_vdb_operation_with_exception(
operation=lambda: entity_vdb.upsert(data_for_vdb),

View file

@ -26,7 +26,6 @@ from lightrag.constants import (
GRAPH_FIELD_SEP,
DEFAULT_MAX_TOTAL_TOKENS,
DEFAULT_MAX_FILE_PATH_LENGTH,
DEFAULT_MAX_CHUNK_IDS_PER_ENTITY,
)
# Initialize logger with basic configuration
@ -2465,23 +2464,25 @@ async def process_chunks_unified(
return final_chunks
def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set:
    """Limit the number of source (chunk) ids kept for an entity.

    Entities that appear a huge number of times can accumulate more chunk ids
    than vector databases allow per record, so the id set is capped at the
    configured maximum.

    Args:
        chunk_ids: Set of source chunk ids collected for the entity.
        entity_name: Entity name, used only in the truncation warning log.
        global_config: Runtime configuration dict; reads
            ``"max_source_ids_per_entity"``.

    Returns:
        The original set unchanged when it is within the limit, otherwise a
        new set of at most ``max_source_ids_per_entity`` ids. NOTE: sets are
        unordered, so which ids survive truncation is arbitrary.
    """
    already_len: int = len(chunk_ids)
    max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"]
    if already_len <= max_chunk_ids_per_entity:
        return chunk_ids
    logger.warning(
        f"Source Ids already exceeds {max_chunk_ids_per_entity} for {entity_name}, "
        f"current size: {already_len}, truncating..."
    )
    truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity])
    return truncated_chunk_ids
def build_file_path(already_file_paths, data_list, target):
"""Build file path string with UTF-8 byte length limit and deduplication