Merge pull request #1778 from danielaskdd/add-ollama-num-ctx

feat: Refine summary logic and add dedicated Ollama num_ctx config
Daniel.y 2025-07-14 02:13:08 +08:00 committed by GitHub
commit 375bfd57a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 32 additions and 19 deletions

View file

@@ -250,7 +250,7 @@ if __name__ == "__main__":
| **embedding_func_max_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` |
| **llm_model_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` |
| **llm_model_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` |
- | **llm_model_max_token_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768` (default value changed by env var MAX_TOKENS) |
+ | **llm_model_max_token_size** | `int` | Maximum number of tokens sent to the LLM when generating entity relation summaries | `32000` (default value changed by env var MAX_TOKENS) |
| **llm_model_max_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `4` (default value changed by env var MAX_ASYNC) |
| **llm_model_kwargs** | `dict` | Additional parameters for LLM generation | |
| **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for the vector database, such as setting the threshold for node and relation retrieval | cosine_better_than_threshold: 0.2 (default value changed by env var COSINE_THRESHOLD) |

View file

@@ -257,7 +257,7 @@ A full list of LightRAG init parameters:
| **embedding_func_max_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` |
| **llm_model_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` |
| **llm_model_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` |
- | **llm_model_max_token_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768` (default value changed by env var MAX_TOKENS) |
+ | **llm_model_max_token_size** | `int` | Maximum number of tokens sent to the LLM to generate entity relation summaries | `32000` (default value changed by env var MAX_TOKENS) |
| **llm_model_max_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `4` (default value changed by env var MAX_ASYNC) |
| **llm_model_kwargs** | `dict` | Additional parameters for LLM generation | |
| **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval | cosine_better_than_threshold: 0.2 (default value changed by env var COSINE_THRESHOLD) |
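For reference, here is a minimal sketch of how these parameters map onto the `LightRAG` constructor. The import path for `gpt_4o_mini_complete` and the `working_dir` value are assumptions based on the usual README examples, and the values shown are illustrative:

```python
import os

from lightrag import LightRAG
from lightrag.llm.openai import gpt_4o_mini_complete  # assumed import path

rag = LightRAG(
    working_dir="./rag_storage",              # illustrative storage directory
    llm_model_func=gpt_4o_mini_complete,      # callable used for LLM generation
    llm_model_name="gpt-4o-mini",             # model name passed to the binding
    # Token budget for entity/relation summary generation,
    # mirroring the MAX_TOKENS environment variable (default 32000).
    llm_model_max_token_size=int(os.getenv("MAX_TOKENS", 32000)),
    # Maximum number of concurrent asynchronous LLM calls (MAX_ASYNC).
    llm_model_max_async=int(os.getenv("MAX_ASYNC", 4)),
)
```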

View file

@@ -46,7 +46,6 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### Chunk size for document splitting, 500~1500 is recommended
# CHUNK_SIZE=1200
# CHUNK_OVERLAP_SIZE=100
- # MAX_TOKEN_SUMMARY=500
### RAG Query Configuration
# HISTORY_TURNS=3
@@ -91,8 +90,7 @@ TEMPERATURE=0
### Max concurrency requests of LLM
MAX_ASYNC=4
### MAX_TOKENS: max tokens sent to LLM for entity relation summaries (less than context size of the model)
- ### MAX_TOKENS: set as num_ctx option for Ollama by API Server
- MAX_TOKENS=32768
+ MAX_TOKENS=32000
### LLM Binding type: openai, ollama, lollms, azure_openai
LLM_BINDING=openai
LLM_MODEL=gpt-4o
@@ -101,6 +99,8 @@ LLM_BINDING_API_KEY=your_api_key
### Optional for Azure
# AZURE_OPENAI_API_VERSION=2024-08-01-preview
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
+ ### set as num_ctx option for Ollama LLM
+ # OLLAMA_NUM_CTX=32768
### Embedding Configuration
### Embedding Binding type: openai, ollama, lollms, azure_openai
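With this split, `MAX_TOKENS` only budgets the text sent for entity/relation summarization, while `OLLAMA_NUM_CTX` sizes the Ollama context window, so the former should stay comfortably below the latter. A small sanity-check sketch over these settings (the helper below is illustrative and not part of LightRAG):

```python
import os

def check_llm_token_budget() -> None:
    """Warn if the summary token budget exceeds the Ollama context window."""
    max_tokens = int(os.getenv("MAX_TOKENS", 32000))     # budget for summary prompts
    num_ctx = int(os.getenv("OLLAMA_NUM_CTX", 32768))    # Ollama context window
    if max_tokens >= num_ctx:
        print(
            f"Warning: MAX_TOKENS ({max_tokens}) should be smaller than "
            f"OLLAMA_NUM_CTX ({num_ctx}) to leave room for the prompt and output."
        )

check_llm_token_budget()
```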

View file

@@ -54,8 +54,8 @@ LLM_BINDING=openai
LLM_MODEL=gpt-4o
LLM_BINDING_HOST=https://api.openai.com/v1
LLM_BINDING_API_KEY=your_api_key
- ### Maximum number of tokens sent to the LLM (less than the model's context size)
- MAX_TOKENS=32768
+ ### Maximum number of tokens sent to the LLM for entity relation summarization (less than the model's context size)
+ MAX_TOKENS=32000
EMBEDDING_BINDING=ollama
EMBEDDING_BINDING_HOST=http://localhost:11434
@@ -71,8 +71,10 @@ LLM_BINDING=ollama
LLM_MODEL=mistral-nemo:latest
LLM_BINDING_HOST=http://localhost:11434
# LLM_BINDING_API_KEY=your_api_key
- ### Maximum number of tokens sent to the LLM (based on your Ollama server capacity)
- MAX_TOKENS=8192
+ ### Maximum number of tokens sent to the LLM for entity relation summarization (less than the model's context size)
+ MAX_TOKENS=7500
+ ### Number of context tokens on the Ollama server (based on your Ollama server capacity)
+ OLLAMA_NUM_CTX=8192
EMBEDDING_BINDING=ollama
EMBEDDING_BINDING_HOST=http://localhost:11434

View file

@@ -71,8 +71,10 @@ LLM_BINDING=ollama
LLM_MODEL=mistral-nemo:latest
LLM_BINDING_HOST=http://localhost:11434
# LLM_BINDING_API_KEY=your_api_key
- ### Max tokens sent to LLM (based on your Ollama Server capacity)
- MAX_TOKENS=8192
+ ### Max tokens sent to LLM for entity relation description summarization (less than LLM context length)
+ MAX_TOKENS=7500
+ ### Ollama Server context length
+ OLLAMA_NUM_CTX=8192
EMBEDDING_BINDING=ollama
EMBEDDING_BINDING_HOST=http://localhost:11434

View file

@@ -108,7 +108,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--max-tokens",
type=int,
default=get_env_value("MAX_TOKENS", 32768, int),
default=get_env_value("MAX_TOKENS", 32000, int),
help="Maximum token size (default: from env or 32768)",
)
@@ -270,6 +270,9 @@ def parse_args() -> argparse.Namespace:
args.llm_binding = "openai"
args.embedding_binding = "ollama"
+ # Ollama num_ctx
+ args.ollama_num_ctx = get_env_value("OLLAMA_NUM_CTX", 32768, int)
args.llm_binding_host = get_env_value(
"LLM_BINDING_HOST", get_default_host(args.llm_binding)
)
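The argument parsing relies on the server's `get_env_value` helper, whose implementation is not shown in this diff; it presumably behaves roughly like the sketch below (read the environment variable, cast it to the requested type, and fall back to the default):

```python
import os

def get_env_value(env_key: str, default, value_type=str):
    """Rough sketch: read env_key, cast to value_type, fall back to default."""
    raw = os.environ.get(env_key)
    if raw is None or raw == "":
        return default
    try:
        return value_type(raw)
    except (TypeError, ValueError):
        return default

# Example: the new Ollama context-window setting with its documented default.
ollama_num_ctx = get_env_value("OLLAMA_NUM_CTX", 32768, int)
```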

View file

@@ -336,7 +336,7 @@ def create_app(args):
llm_model_kwargs={
"host": args.llm_binding_host,
"timeout": args.timeout,
"options": {"num_ctx": args.max_tokens},
"options": {"num_ctx": args.ollama_num_ctx},
"api_key": args.llm_binding_api_key,
}
if args.llm_binding == "lollms" or args.llm_binding == "ollama"
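With this change the Ollama context window no longer piggybacks on the summary token limit: `num_ctx` now comes from `OLLAMA_NUM_CTX`, while `--max-tokens`/`MAX_TOKENS` only caps summary prompts. The kwargs assembled here are eventually forwarded to the Ollama binding, which passes `options` through to the Ollama API, roughly as in this sketch using the official `ollama` Python client (LightRAG's own wrapper names may differ):

```python
import ollama  # official Ollama Python client

def complete_with_ollama(prompt: str, host: str, num_ctx: int) -> str:
    """Sketch: forward num_ctx to the Ollama server via the options dict."""
    client = ollama.Client(host=host)
    response = client.chat(
        model="mistral-nemo:latest",
        messages=[{"role": "user", "content": prompt}],
        options={"num_ctx": num_ctx},  # context window, decoupled from MAX_TOKENS
    )
    return response["message"]["content"]

# Example call mirroring the server defaults (OLLAMA_NUM_CTX=32768):
# print(complete_with_ollama("Hello", "http://localhost:11434", 32768))
```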

View file

@@ -231,7 +231,7 @@ class LightRAG:
llm_model_name: str = field(default="gpt-4o-mini")
"""Name of the LLM model used for generating responses."""
llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768)))
llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32000)))
"""Maximum number of tokens allowed per LLM response."""
llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 4)))

View file

@@ -118,7 +118,7 @@ async def _handle_entity_relation_summary(
tokenizer: Tokenizer = global_config["tokenizer"]
llm_max_tokens = global_config["llm_model_max_token_size"]
summary_max_tokens = global_config["summary_to_max_tokens"]
# summary_max_tokens = global_config["summary_to_max_tokens"]
language = global_config["addon_params"].get(
"language", PROMPTS["DEFAULT_LANGUAGE"]
@@ -145,7 +145,7 @@ async def _handle_entity_relation_summary(
use_prompt,
use_llm_func,
llm_response_cache=llm_response_cache,
- max_tokens=summary_max_tokens,
+ # max_tokens=summary_max_tokens,
cache_type="extract",
)
return summary
@@ -687,7 +687,10 @@ async def _rebuild_single_entity(
# Helper function to generate final description with optional LLM summary
async def _generate_final_description(combined_description: str) -> str:
if len(combined_description) > global_config["summary_to_max_tokens"]:
force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
if num_fragment >= force_llm_summary_on_merge:
return await _handle_entity_relation_summary(
entity_name,
combined_description,
@@ -842,8 +845,11 @@ async def _rebuild_single_relationship(
# )
weight = sum(weights) if weights else current_relationship.get("weight", 1.0)
- # Use summary if description is too long
- if len(combined_description) > global_config["summary_to_max_tokens"]:
+ # Use summary if description has too many fragments
+ force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
+ num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
+ if num_fragment >= force_llm_summary_on_merge:
final_description = await _handle_entity_relation_summary(
f"{src}-{tgt}",
combined_description,
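In `operate.py`, the merge logic now triggers an LLM summary based on how many description fragments were merged (fragments are joined with `GRAPH_FIELD_SEP`) rather than on the character length of the combined text against `summary_to_max_tokens`. A standalone sketch of that decision; the separator value and the threshold shown are assumptions for illustration:

```python
# Minimal sketch of the fragment-count check used above.
GRAPH_FIELD_SEP = "<SEP>"  # assumed separator; LightRAG defines this in its prompt constants

def needs_llm_summary(combined_description: str, force_llm_summary_on_merge: int) -> bool:
    """Return True when enough fragments were merged to warrant an LLM summary."""
    num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
    return num_fragment >= force_llm_summary_on_merge

# Example: four merged fragments against a threshold of 4 triggers summarization.
merged = "<SEP>".join(["desc one", "desc two", "desc three", "desc four"])
print(needs_llm_summary(merged, force_llm_summary_on_merge=4))  # True
```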