From b03bb48e24afd1af9741722b0259df1676498442 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Mon, 14 Jul 2025 01:55:04 +0800
Subject: [PATCH] feat: Refine summary logic and add dedicated Ollama num_ctx config

- Refactor the trigger condition for LLM-based summarization of entities and
  relations. Instead of relying on character length, the summary is now
  triggered when the number of merged description fragments reaches a
  configured threshold (`force_llm_summary_on_merge`). This provides a more
  robust and logical condition for consolidation.
- Introduce the `OLLAMA_NUM_CTX` environment variable to explicitly configure
  the context window size (`num_ctx`) for Ollama models. This decouples the
  model's context length from the `MAX_TOKENS` parameter, which is now used
  only to limit the input for summary generation, making the configuration
  clearer and more flexible.
- Update `README` files, `env.example`, and default values to reflect these
  changes.
---
 README-zh.md                    |  2 +-
 README.md                       |  2 +-
 env.example                     |  6 +++---
 lightrag/api/README-zh.md       | 10 ++++++----
 lightrag/api/README.md          |  6 ++++--
 lightrag/api/config.py          |  5 ++++-
 lightrag/api/lightrag_server.py |  2 +-
 lightrag/lightrag.py            |  2 +-
 lightrag/operate.py             | 16 +++++++++++-----
 9 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/README-zh.md b/README-zh.md
index d6aef2c8..8b377e0e 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -250,7 +250,7 @@ if __name__ == "__main__":
 | **embedding_func_max_async** | `int` | 最大并发异步嵌入进程数 | `16` |
 | **llm_model_func** | `callable` | LLM生成的函数 | `gpt_4o_mini_complete` |
 | **llm_model_name** | `str` | 用于生成的LLM模型名称 | `meta-llama/Llama-3.2-1B-Instruct` |
-| **llm_model_max_token_size** | `int` | LLM生成的最大令牌大小(影响实体关系摘要) | `32768`(默认值由环境变量MAX_TOKENS更改) |
+| **llm_model_max_token_size** | `int` | 生成实体关系摘要时送给LLM的最大令牌数 | `32000`(默认值由环境变量MAX_TOKENS更改) |
 | **llm_model_max_async** | `int` | 最大并发异步LLM进程数 | `4`(默认值由环境变量MAX_ASYNC更改) |
 | **llm_model_kwargs** | `dict` | LLM生成的附加参数 | |
 | **vector_db_storage_cls_kwargs** | `dict` | 向量数据库的附加参数,如设置节点和关系检索的阈值 | cosine_better_than_threshold: 0.2(默认值由环境变量COSINE_THRESHOLD更改) |
diff --git a/README.md b/README.md
index 5fb6149b..5d8a642f 100644
--- a/README.md
+++ b/README.md
@@ -257,7 +257,7 @@ A full list of LightRAG init parameters:
 | **embedding_func_max_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` |
 | **llm_model_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` |
 | **llm_model_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` |
-| **llm_model_max_token_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768`(default value changed by env var MAX_TOKENS) |
+| **llm_model_max_token_size** | `int` | Maximum tokens sent to LLM to generate entity relation summaries | `32000`(default value changed by env var MAX_TOKENS) |
 | **llm_model_max_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `4`(default value changed by env var MAX_ASYNC) |
 | **llm_model_kwargs** | `dict` | Additional parameters for LLM generation | |
 | **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval | cosine_better_than_threshold: 0.2(default value changed by env var COSINE_THRESHOLD) |
diff --git a/env.example b/env.example
index 24fb11a1..f8f6d614 100644
--- a/env.example
+++ b/env.example
@@ -46,7 +46,6 @@ OLLAMA_EMULATING_MODEL_TAG=latest
 ### Chunk size for document splitting, 500~1500 is recommended
 # CHUNK_SIZE=1200
 # CHUNK_OVERLAP_SIZE=100
-# MAX_TOKEN_SUMMARY=500

 ### RAG Query Configuration
 # HISTORY_TURNS=3
@@ -91,8 +90,7 @@ TEMPERATURE=0
 ### Max concurrency requests of LLM
 MAX_ASYNC=4
 ### MAX_TOKENS: max tokens send to LLM for entity relation summaries (less than context size of the model)
-### MAX_TOKENS: set as num_ctx option for Ollama by API Server
-MAX_TOKENS=32768
+MAX_TOKENS=32000
 ### LLM Binding type: openai, ollama, lollms, azure_openai
 LLM_BINDING=openai
 LLM_MODEL=gpt-4o
@@ -101,6 +99,8 @@ LLM_BINDING_API_KEY=your_api_key
 ### Optional for Azure
 # AZURE_OPENAI_API_VERSION=2024-08-01-preview
 # AZURE_OPENAI_DEPLOYMENT=gpt-4o
+### set as num_ctx option for Ollama LLM
+# OLLAMA_NUM_CTX=32768

 ### Embedding Configuration
 ### Embedding Binding type: openai, ollama, lollms, azure_openai
diff --git a/lightrag/api/README-zh.md b/lightrag/api/README-zh.md
index 1c33d835..7d70853f 100644
--- a/lightrag/api/README-zh.md
+++ b/lightrag/api/README-zh.md
@@ -54,8 +54,8 @@ LLM_BINDING=openai
 LLM_MODEL=gpt-4o
 LLM_BINDING_HOST=https://api.openai.com/v1
 LLM_BINDING_API_KEY=your_api_key
-### 发送给 LLM 的最大 token 数(小于模型上下文大小)
-MAX_TOKENS=32768
+### 发送给 LLM 进行实体关系摘要的最大 token 数(小于模型上下文大小)
+MAX_TOKENS=32000

 EMBEDDING_BINDING=ollama
 EMBEDDING_BINDING_HOST=http://localhost:11434
@@ -71,8 +71,10 @@ LLM_BINDING=ollama
 LLM_MODEL=mistral-nemo:latest
 LLM_BINDING_HOST=http://localhost:11434
 # LLM_BINDING_API_KEY=your_api_key
-### 发送给 LLM 的最大 token 数(基于您的 Ollama 服务器容量)
-MAX_TOKENS=8192
+### 发送给 LLM 进行实体关系摘要的最大 token 数(小于模型上下文大小)
+MAX_TOKENS=7500
+### Ollama 服务器上下文 token 数(基于您的 Ollama 服务器容量)
+OLLAMA_NUM_CTX=8192

 EMBEDDING_BINDING=ollama
 EMBEDDING_BINDING_HOST=http://localhost:11434
diff --git a/lightrag/api/README.md b/lightrag/api/README.md
index 915ad7f3..4ba9b2cf 100644
--- a/lightrag/api/README.md
+++ b/lightrag/api/README.md
@@ -71,8 +71,10 @@ LLM_BINDING=ollama
 LLM_MODEL=mistral-nemo:latest
 LLM_BINDING_HOST=http://localhost:11434
 # LLM_BINDING_API_KEY=your_api_key
-### Max tokens sent to LLM (based on your Ollama Server capacity)
-MAX_TOKENS=8192
+### Max tokens sent to LLM for entity relation description summarization (Less than LLM context length)
+MAX_TOKENS=7500
+### Ollama Server context length
+OLLAMA_NUM_CTX=8192

 EMBEDDING_BINDING=ollama
 EMBEDDING_BINDING_HOST=http://localhost:11434
diff --git a/lightrag/api/config.py b/lightrag/api/config.py
index 70147bde..e8a9cea3 100644
--- a/lightrag/api/config.py
+++ b/lightrag/api/config.py
@@ -108,7 +108,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--max-tokens",
         type=int,
-        default=get_env_value("MAX_TOKENS", 32768, int),
+        default=get_env_value("MAX_TOKENS", 32000, int),
         help="Maximum token size (default: from env or 32768)",
     )

@@ -270,6 +270,9 @@
         args.llm_binding = "openai"
         args.embedding_binding = "ollama"

+    # Ollama num_ctx
+    args.ollama_num_ctx = get_env_value("OLLAMA_NUM_CTX", 32768, int)
+
     args.llm_binding_host = get_env_value(
         "LLM_BINDING_HOST", get_default_host(args.llm_binding)
     )
diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 693ed48f..bd0154c9 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -336,7 +336,7 @@ def create_app(args):
         llm_model_kwargs={
             "host": args.llm_binding_host,
             "timeout": args.timeout,
-            "options": {"num_ctx": args.max_tokens},
+            "options": {"num_ctx": args.ollama_num_ctx},
             "api_key": args.llm_binding_api_key,
         }
         if args.llm_binding == "lollms" or args.llm_binding == "ollama"
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 6b85906a..b6cca32a 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -231,7 +231,7 @@ class LightRAG:
     llm_model_name: str = field(default="gpt-4o-mini")
     """Name of the LLM model used for generating responses."""

-    llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768)))
+    llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32000)))
     """Maximum number of tokens allowed per LLM response."""

     llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 4)))
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 9eb060bb..49de3c71 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -118,7 +118,7 @@ async def _handle_entity_relation_summary(
     tokenizer: Tokenizer = global_config["tokenizer"]
     llm_max_tokens = global_config["llm_model_max_token_size"]
-    summary_max_tokens = global_config["summary_to_max_tokens"]
+    # summary_max_tokens = global_config["summary_to_max_tokens"]

     language = global_config["addon_params"].get(
         "language", PROMPTS["DEFAULT_LANGUAGE"]
@@ -145,7 +145,7 @@
         use_prompt,
         use_llm_func,
         llm_response_cache=llm_response_cache,
-        max_tokens=summary_max_tokens,
+        # max_tokens=summary_max_tokens,
         cache_type="extract",
     )
     return summary
@@ -687,7 +687,10 @@ async def _rebuild_single_entity(

     # Helper function to generate final description with optional LLM summary
     async def _generate_final_description(combined_description: str) -> str:
-        if len(combined_description) > global_config["summary_to_max_tokens"]:
+        force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
+        num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
+
+        if num_fragment >= force_llm_summary_on_merge:
             return await _handle_entity_relation_summary(
                 entity_name,
                 combined_description,
@@ -842,8 +845,11 @@ async def _rebuild_single_relationship(
     # )
     weight = sum(weights) if weights else current_relationship.get("weight", 1.0)

-    # Use summary if description is too long
-    if len(combined_description) > global_config["summary_to_max_tokens"]:
+    # Use summary if description has too many fragments
+    force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
+    num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
+
+    if num_fragment >= force_llm_summary_on_merge:
         final_description = await _handle_entity_relation_summary(
             f"{src}-{tgt}",
             combined_description,
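
For reviewers, a minimal standalone sketch of the new merge-summary trigger introduced in `operate.py`. The separator value `"<SEP>"` and the example threshold of 6 are assumptions for illustration; in the patch itself the threshold comes from `global_config["force_llm_summary_on_merge"]` and the summary is produced asynchronously by `_handle_entity_relation_summary`.

```python
# Sketch of the fragment-count trigger (illustration only, not the patched code).
# Assumptions: GRAPH_FIELD_SEP is "<SEP>" and the threshold defaults to 6.
GRAPH_FIELD_SEP = "<SEP>"
FORCE_LLM_SUMMARY_ON_MERGE = 6


def needs_llm_summary(combined_description: str,
                      threshold: int = FORCE_LLM_SUMMARY_ON_MERGE) -> bool:
    """Return True when the merged description holds enough fragments to summarize."""
    # Fragments are joined with GRAPH_FIELD_SEP, so separator count + 1 = fragment count.
    num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
    return num_fragment >= threshold


# Example: three merged fragments are kept as-is; eight fragments trigger a summary.
short = GRAPH_FIELD_SEP.join(["desc one", "desc two", "desc three"])
long = GRAPH_FIELD_SEP.join(f"desc {i}" for i in range(8))
assert not needs_llm_summary(short)
assert needs_llm_summary(long)
```

Under this scheme `MAX_TOKENS` only caps the input handed to the LLM when such a summary is generated, while `OLLAMA_NUM_CTX` independently sets the `options.num_ctx` context window passed to Ollama by the API server.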