From b03bb48e24afd1af9741722b0259df1676498442 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Mon, 14 Jul 2025 01:55:04 +0800
Subject: [PATCH] feat: Refine summary logic and add dedicated Ollama num_ctx config

- Refactor the trigger condition for LLM-based summarization of entities and
  relations. Instead of relying on character length, the summary is now
  triggered when the number of merged description fragments reaches a
  configured threshold (`force_llm_summary_on_merge`). This provides a more
  robust and logical condition for consolidation.
- Introduce the `OLLAMA_NUM_CTX` environment variable to explicitly configure
  the context window size (`num_ctx`) for Ollama models. This decouples the
  model's context length from the `MAX_TOKENS` parameter, which is now used
  only to limit the input for summary generation, making the configuration
  clearer and more flexible.
- Update `README` files, `env.example`, and default values to reflect these
  changes.
---
 README-zh.md                    |  2 +-
 README.md                       |  2 +-
 env.example                     |  6 +++---
 lightrag/api/README-zh.md       | 10 ++++++----
 lightrag/api/README.md          |  6 ++++--
 lightrag/api/config.py          |  5 ++++-
 lightrag/api/lightrag_server.py |  2 +-
 lightrag/lightrag.py            |  2 +-
 lightrag/operate.py             | 16 +++++++++++-----
 9 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/README-zh.md b/README-zh.md
index d6aef2c8..8b377e0e 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -250,7 +250,7 @@ if __name__ == "__main__":
 | **embedding_func_max_async** | `int` | 最大并发异步嵌入进程数 | `16` |
 | **llm_model_func** | `callable` | LLM生成的函数 | `gpt_4o_mini_complete` |
 | **llm_model_name** | `str` | 用于生成的LLM模型名称 | `meta-llama/Llama-3.2-1B-Instruct` |
-| **llm_model_max_token_size** | `int` | LLM生成的最大令牌大小(影响实体关系摘要) | `32768`(默认值由环境变量MAX_TOKENS更改) |
+| **llm_model_max_token_size** | `int` | 生成实体关系摘要时送给LLM的最大令牌数 | `32000`(默认值由环境变量MAX_TOKENS更改) |
 | **llm_model_max_async** | `int` | 最大并发异步LLM进程数 | `4`(默认值由环境变量MAX_ASYNC更改) |
 | **llm_model_kwargs** | `dict` | LLM生成的附加参数 | |
 | **vector_db_storage_cls_kwargs** | `dict` | 向量数据库的附加参数,如设置节点和关系检索的阈值 | cosine_better_than_threshold: 0.2(默认值由环境变量COSINE_THRESHOLD更改) |
diff --git a/README.md b/README.md
index 5fb6149b..5d8a642f 100644
--- a/README.md
+++ b/README.md
@@ -257,7 +257,7 @@ A full list of LightRAG init parameters:
 | **embedding_func_max_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` |
 | **llm_model_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` |
 | **llm_model_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` |
-| **llm_model_max_token_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768`(default value changed by env var MAX_TOKENS) |
+| **llm_model_max_token_size** | `int` | Maximum tokens sent to LLM to generate entity relation summaries | `32000`(default value changed by env var MAX_TOKENS) |
 | **llm_model_max_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `4`(default value changed by env var MAX_ASYNC) |
 | **llm_model_kwargs** | `dict` | Additional parameters for LLM generation | |
 | **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval | cosine_better_than_threshold: 0.2(default value changed by env var COSINE_THRESHOLD) |
diff --git a/env.example b/env.example
index 24fb11a1..f8f6d614 100644
--- a/env.example
+++ b/env.example
@@ -46,7 +46,6 @@ OLLAMA_EMULATING_MODEL_TAG=latest
 ### Chunk size for document splitting, 500~1500 is recommended
 # CHUNK_SIZE=1200
 # CHUNK_OVERLAP_SIZE=100
-# MAX_TOKEN_SUMMARY=500

 ### RAG Query Configuration
 # HISTORY_TURNS=3
@@ -91,8 +90,7 @@ TEMPERATURE=0
 ### Max concurrency requests of LLM
 MAX_ASYNC=4
 ### MAX_TOKENS: max tokens send to LLM for entity relation summaries (less than context size of the model)
-### MAX_TOKENS: set as num_ctx option for Ollama by API Server
-MAX_TOKENS=32768
+MAX_TOKENS=32000
 ### LLM Binding type: openai, ollama, lollms, azure_openai
 LLM_BINDING=openai
 LLM_MODEL=gpt-4o
@@ -101,6 +99,8 @@ LLM_BINDING_API_KEY=your_api_key
 ### Optional for Azure
 # AZURE_OPENAI_API_VERSION=2024-08-01-preview
 # AZURE_OPENAI_DEPLOYMENT=gpt-4o
+### set as num_ctx option for Ollama LLM
+# OLLAMA_NUM_CTX=32768

 ### Embedding Configuration
 ### Embedding Binding type: openai, ollama, lollms, azure_openai
diff --git a/lightrag/api/README-zh.md b/lightrag/api/README-zh.md
index 1c33d835..7d70853f 100644
--- a/lightrag/api/README-zh.md
+++ b/lightrag/api/README-zh.md
@@ -54,8 +54,8 @@ LLM_BINDING=openai
 LLM_MODEL=gpt-4o
 LLM_BINDING_HOST=https://api.openai.com/v1
 LLM_BINDING_API_KEY=your_api_key
-### 发送给 LLM 的最大 token 数(小于模型上下文大小)
-MAX_TOKENS=32768
+### 发送给 LLM 进行实体关系摘要的最大 token 数(小于模型上下文大小)
+MAX_TOKENS=32000

 EMBEDDING_BINDING=ollama
 EMBEDDING_BINDING_HOST=http://localhost:11434
@@ -71,8 +71,10 @@ LLM_BINDING=ollama
 LLM_MODEL=mistral-nemo:latest
 LLM_BINDING_HOST=http://localhost:11434
 # LLM_BINDING_API_KEY=your_api_key
-### 发送给 LLM 的最大 token 数(基于您的 Ollama 服务器容量)
-MAX_TOKENS=8192
+### 发送给 LLM 进行实体关系摘要的最大 token 数(小于模型上下文大小)
+MAX_TOKENS=7500
+### Ollama 服务器上下文 token 数(基于您的 Ollama 服务器容量)
+OLLAMA_NUM_CTX=8192

 EMBEDDING_BINDING=ollama
 EMBEDDING_BINDING_HOST=http://localhost:11434
diff --git a/lightrag/api/README.md b/lightrag/api/README.md
index 915ad7f3..4ba9b2cf 100644
--- a/lightrag/api/README.md
+++ b/lightrag/api/README.md
@@ -71,8 +71,10 @@ LLM_BINDING=ollama
 LLM_MODEL=mistral-nemo:latest
 LLM_BINDING_HOST=http://localhost:11434
 # LLM_BINDING_API_KEY=your_api_key
-### Max tokens sent to LLM (based on your Ollama Server capacity)
-MAX_TOKENS=8192
+### Max tokens sent to LLM for entity relation description summarization (Less than LLM context length)
+MAX_TOKENS=7500
+### Ollama Server context length
+OLLAMA_NUM_CTX=8192

 EMBEDDING_BINDING=ollama
 EMBEDDING_BINDING_HOST=http://localhost:11434
diff --git a/lightrag/api/config.py b/lightrag/api/config.py
index 70147bde..e8a9cea3 100644
--- a/lightrag/api/config.py
+++ b/lightrag/api/config.py
@@ -108,7 +108,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--max-tokens",
         type=int,
-        default=get_env_value("MAX_TOKENS", 32768, int),
+        default=get_env_value("MAX_TOKENS", 32000, int),
         help="Maximum token size (default: from env or 32768)",
     )

@@ -270,6 +270,9 @@
         args.llm_binding = "openai"
         args.embedding_binding = "ollama"

+    # Ollama num_ctx
+    args.ollama_num_ctx = get_env_value("OLLAMA_NUM_CTX", 32768, int)
+
     args.llm_binding_host = get_env_value(
         "LLM_BINDING_HOST", get_default_host(args.llm_binding)
     )
diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 693ed48f..bd0154c9 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -336,7 +336,7 @@ def create_app(args):
         llm_model_kwargs={
             "host": args.llm_binding_host,
             "timeout": args.timeout,
-            "options": {"num_ctx": args.max_tokens},
+            "options": {"num_ctx": args.ollama_num_ctx},
             "api_key": args.llm_binding_api_key,
         }
         if args.llm_binding == "lollms" or args.llm_binding == "ollama"
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 6b85906a..b6cca32a 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -231,7 +231,7 @@ class LightRAG:
     llm_model_name: str = field(default="gpt-4o-mini")
     """Name of the LLM model used for generating responses."""

-    llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768)))
+    llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32000)))
     """Maximum number of tokens allowed per LLM response."""

     llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 4)))
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 9eb060bb..49de3c71 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -118,7 +118,7 @@ async def _handle_entity_relation_summary(
     tokenizer: Tokenizer = global_config["tokenizer"]
     llm_max_tokens = global_config["llm_model_max_token_size"]
-    summary_max_tokens = global_config["summary_to_max_tokens"]
+    # summary_max_tokens = global_config["summary_to_max_tokens"]

     language = global_config["addon_params"].get(
         "language", PROMPTS["DEFAULT_LANGUAGE"]
@@ -145,7 +145,7 @@
         use_prompt,
         use_llm_func,
         llm_response_cache=llm_response_cache,
-        max_tokens=summary_max_tokens,
+        # max_tokens=summary_max_tokens,
         cache_type="extract",
     )
     return summary
@@ -687,7 +687,10 @@ async def _rebuild_single_entity(

     # Helper function to generate final description with optional LLM summary
     async def _generate_final_description(combined_description: str) -> str:
-        if len(combined_description) > global_config["summary_to_max_tokens"]:
+        force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
+        num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
+
+        if num_fragment >= force_llm_summary_on_merge:
             return await _handle_entity_relation_summary(
                 entity_name,
                 combined_description,
@@ -842,8 +845,11 @@ async def _rebuild_single_relationship(
     # )
     weight = sum(weights) if weights else current_relationship.get("weight", 1.0)

-    # Use summary if description is too long
-    if len(combined_description) > global_config["summary_to_max_tokens"]:
+    # Use summary if description has too many fragments
+    force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
+    num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
+
+    if num_fragment >= force_llm_summary_on_merge:
         final_description = await _handle_entity_relation_summary(
             f"{src}-{tgt}",
             combined_description,
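
For reviewers, a minimal standalone sketch of the new merge-summary trigger introduced in `operate.py`. The separator value `"<SEP>"` and the example threshold of 6 are assumptions for illustration; in the patch itself the threshold comes from `global_config["force_llm_summary_on_merge"]` and the summary is produced asynchronously by `_handle_entity_relation_summary`.

```python
# Sketch of the fragment-count trigger (illustration only, not the patched code).
# Assumptions: GRAPH_FIELD_SEP is "<SEP>" and the threshold defaults to 6.
GRAPH_FIELD_SEP = "<SEP>"
FORCE_LLM_SUMMARY_ON_MERGE = 6


def needs_llm_summary(combined_description: str,
                      threshold: int = FORCE_LLM_SUMMARY_ON_MERGE) -> bool:
    """Return True when the merged description holds enough fragments to summarize."""
    # Fragments are joined with GRAPH_FIELD_SEP, so separator count + 1 = fragment count.
    num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
    return num_fragment >= threshold


# Example: three merged fragments are kept as-is; eight fragments trigger a summary.
short = GRAPH_FIELD_SEP.join(["desc one", "desc two", "desc three"])
long = GRAPH_FIELD_SEP.join(f"desc {i}" for i in range(8))
assert not needs_llm_summary(short)
assert needs_llm_summary(long)
```

Under this scheme `MAX_TOKENS` only caps the input handed to the LLM when such a summary is generated, while `OLLAMA_NUM_CTX` independently sets the `options.num_ctx` context window passed to Ollama by the API server.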