diff --git a/env.example b/env.example index 4c8d355d..1d2b81f3 100644 --- a/env.example +++ b/env.example @@ -73,6 +73,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 +### control the maximum number of chunk_ids stored per entity in the vector db +# MAX_CHUNK_IDS_PER_ENTITY=500 ### maximum number of related chunks per source entity or relation ### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) diff --git a/lightrag/operate.py b/lightrag/operate.py index 29a17e68..34a8a613 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -53,7 +53,6 @@ from .constants import ( DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_ENTITY_TYPES, DEFAULT_SUMMARY_LANGUAGE, - DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, ) from .kg.shared_storage import get_storage_keyed_lock import time diff --git a/lightrag/utils.py b/lightrag/utils.py index 17ee43a6..b33c5a15 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2469,13 +2469,15 @@ def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set: """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)""" already_len: int = len(chunk_ids) - if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY: + max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int) + + if already_len >= max_chunk_ids_per_entity: logger.warning( - f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, " + f"Chunk Ids already exceeds {max_chunk_ids_per_entity} for {entity_name}, " f"current size: {already_len} entries." ) - truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY]) + truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity]) return truncated_chunk_ids