Merge pull request #1778 from danielaskdd/add-ollama-num-ctx

feat: Refine summary logic and add dedicated Ollama num_ctx config
Daniel.y 2025-07-14 02:13:08 +08:00 committed by GitHub
commit 375bfd57a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 32 additions and 19 deletions

View file

@@ -250,7 +250,7 @@ if __name__ == "__main__":
| **embedding_func_max_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` |
| **llm_model_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` |
| **llm_model_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` |
- | **llm_model_max_token_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768` (default value changed by env var MAX_TOKENS) |
+ | **llm_model_max_token_size** | `int` | Maximum number of tokens sent to the LLM when generating entity relation summaries | `32000` (default value changed by env var MAX_TOKENS) |
| **llm_model_max_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `4` (default value changed by env var MAX_ASYNC) |
| **llm_model_kwargs** | `dict` | Additional parameters for LLM generation | |
| **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for the vector database, such as setting the threshold for node and relation retrieval | cosine_better_than_threshold: 0.2 (default value changed by env var COSINE_THRESHOLD) |

View file

@@ -257,7 +257,7 @@ A full list of LightRAG init parameters:
| **embedding_func_max_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` |
| **llm_model_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` |
| **llm_model_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` |
- | **llm_model_max_token_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768` (default value changed by env var MAX_TOKENS) |
+ | **llm_model_max_token_size** | `int` | Maximum number of tokens sent to the LLM to generate entity relation summaries | `32000` (default value changed by env var MAX_TOKENS) |
| **llm_model_max_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `4` (default value changed by env var MAX_ASYNC) |
| **llm_model_kwargs** | `dict` | Additional parameters for LLM generation | |
| **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval | cosine_better_than_threshold: 0.2 (default value changed by env var COSINE_THRESHOLD) |
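For reference, here is a minimal sketch of how these parameters map onto the `LightRAG` constructor. The import path for `gpt_4o_mini_complete` and the `working_dir` value are assumptions based on the usual README examples, and the values shown are illustrative:

```python
import os

from lightrag import LightRAG
from lightrag.llm.openai import gpt_4o_mini_complete  # assumed import path

rag = LightRAG(
    working_dir="./rag_storage",              # illustrative storage directory
    llm_model_func=gpt_4o_mini_complete,      # callable used for LLM generation
    llm_model_name="gpt-4o-mini",             # model name passed to the binding
    # Token budget for entity/relation summary generation,
    # mirroring the MAX_TOKENS environment variable (default 32000).
    llm_model_max_token_size=int(os.getenv("MAX_TOKENS", 32000)),
    # Maximum number of concurrent asynchronous LLM calls (MAX_ASYNC).
    llm_model_max_async=int(os.getenv("MAX_ASYNC", 4)),
)
```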

View file

@@ -46,7 +46,6 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### Chunk size for document splitting, 500~1500 is recommended
# CHUNK_SIZE=1200
# CHUNK_OVERLAP_SIZE=100
- # MAX_TOKEN_SUMMARY=500
### RAG Query Configuration
# HISTORY_TURNS=3
@@ -91,8 +90,7 @@ TEMPERATURE=0
### Max concurrency requests of LLM
MAX_ASYNC=4
### MAX_TOKENS: max tokens sent to LLM for entity relation summaries (less than context size of the model)
- ### MAX_TOKENS: set as num_ctx option for Ollama by API Server
- MAX_TOKENS=32768
+ MAX_TOKENS=32000
### LLM Binding type: openai, ollama, lollms, azure_openai
LLM_BINDING=openai
LLM_MODEL=gpt-4o
@@ -101,6 +99,8 @@ LLM_BINDING_API_KEY=your_api_key
### Optional for Azure
# AZURE_OPENAI_API_VERSION=2024-08-01-preview
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
+ ### set as num_ctx option for Ollama LLM
+ # OLLAMA_NUM_CTX=32768
### Embedding Configuration
### Embedding Binding type: openai, ollama, lollms, azure_openai
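With this split, `MAX_TOKENS` only budgets the text sent for entity/relation summarization, while `OLLAMA_NUM_CTX` sizes the Ollama context window, so the former should stay comfortably below the latter. A small sanity-check sketch over these settings (the helper below is illustrative and not part of LightRAG):

```python
import os

def check_llm_token_budget() -> None:
    """Warn if the summary token budget exceeds the Ollama context window."""
    max_tokens = int(os.getenv("MAX_TOKENS", 32000))     # budget for summary prompts
    num_ctx = int(os.getenv("OLLAMA_NUM_CTX", 32768))    # Ollama context window
    if max_tokens >= num_ctx:
        print(
            f"Warning: MAX_TOKENS ({max_tokens}) should be smaller than "
            f"OLLAMA_NUM_CTX ({num_ctx}) to leave room for the prompt and output."
        )

check_llm_token_budget()
```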

View file

@@ -54,8 +54,8 @@ LLM_BINDING=openai
LLM_MODEL=gpt-4o
LLM_BINDING_HOST=https://api.openai.com/v1
LLM_BINDING_API_KEY=your_api_key
- ### Maximum number of tokens sent to the LLM (less than the model's context size)
- MAX_TOKENS=32768
+ ### Maximum number of tokens sent to the LLM for entity relation summarization (less than the model's context size)
+ MAX_TOKENS=32000
EMBEDDING_BINDING=ollama
EMBEDDING_BINDING_HOST=http://localhost:11434
@@ -71,8 +71,10 @@ LLM_BINDING=ollama
LLM_MODEL=mistral-nemo:latest
LLM_BINDING_HOST=http://localhost:11434
# LLM_BINDING_API_KEY=your_api_key
- ### Maximum number of tokens sent to the LLM (based on your Ollama server capacity)
- MAX_TOKENS=8192
+ ### Maximum number of tokens sent to the LLM for entity relation summarization (less than the model's context size)
+ MAX_TOKENS=7500
+ ### Number of context tokens on the Ollama server (based on your Ollama server capacity)
+ OLLAMA_NUM_CTX=8192
EMBEDDING_BINDING=ollama
EMBEDDING_BINDING_HOST=http://localhost:11434

View file

@@ -71,8 +71,10 @@ LLM_BINDING=ollama
LLM_MODEL=mistral-nemo:latest
LLM_BINDING_HOST=http://localhost:11434
# LLM_BINDING_API_KEY=your_api_key
- ### Max tokens sent to LLM (based on your Ollama Server capacity)
- MAX_TOKENS=8192
+ ### Max tokens sent to LLM for entity relation description summarization (less than LLM context length)
+ MAX_TOKENS=7500
+ ### Ollama Server context length
+ OLLAMA_NUM_CTX=8192
EMBEDDING_BINDING=ollama
EMBEDDING_BINDING_HOST=http://localhost:11434

View file

@@ -108,7 +108,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--max-tokens",
type=int,
default=get_env_value("MAX_TOKENS", 32768, int),
default=get_env_value("MAX_TOKENS", 32000, int),
help="Maximum token size (default: from env or 32768)",
)
@@ -270,6 +270,9 @@ def parse_args() -> argparse.Namespace:
args.llm_binding = "openai"
args.embedding_binding = "ollama"
+ # Ollama num_ctx
+ args.ollama_num_ctx = get_env_value("OLLAMA_NUM_CTX", 32768, int)
args.llm_binding_host = get_env_value(
"LLM_BINDING_HOST", get_default_host(args.llm_binding)
)
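The argument parsing relies on the server's `get_env_value` helper, whose implementation is not shown in this diff; it presumably behaves roughly like the sketch below (read the environment variable, cast it to the requested type, and fall back to the default):

```python
import os

def get_env_value(env_key: str, default, value_type=str):
    """Rough sketch: read env_key, cast to value_type, fall back to default."""
    raw = os.environ.get(env_key)
    if raw is None or raw == "":
        return default
    try:
        return value_type(raw)
    except (TypeError, ValueError):
        return default

# Example: the new Ollama context-window setting with its documented default.
ollama_num_ctx = get_env_value("OLLAMA_NUM_CTX", 32768, int)
```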

View file

@@ -336,7 +336,7 @@ def create_app(args):
llm_model_kwargs={
"host": args.llm_binding_host,
"timeout": args.timeout,
"options": {"num_ctx": args.max_tokens},
"options": {"num_ctx": args.ollama_num_ctx},
"api_key": args.llm_binding_api_key,
}
if args.llm_binding == "lollms" or args.llm_binding == "ollama"
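With this change the Ollama context window no longer piggybacks on the summary token limit: `num_ctx` now comes from `OLLAMA_NUM_CTX`, while `--max-tokens`/`MAX_TOKENS` only caps summary prompts. The kwargs assembled here are eventually forwarded to the Ollama binding, which passes `options` through to the Ollama API, roughly as in this sketch using the official `ollama` Python client (LightRAG's own wrapper names may differ):

```python
import ollama  # official Ollama Python client

def complete_with_ollama(prompt: str, host: str, num_ctx: int) -> str:
    """Sketch: forward num_ctx to the Ollama server via the options dict."""
    client = ollama.Client(host=host)
    response = client.chat(
        model="mistral-nemo:latest",
        messages=[{"role": "user", "content": prompt}],
        options={"num_ctx": num_ctx},  # context window, decoupled from MAX_TOKENS
    )
    return response["message"]["content"]

# Example call mirroring the server defaults (OLLAMA_NUM_CTX=32768):
# print(complete_with_ollama("Hello", "http://localhost:11434", 32768))
```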

View file

@@ -231,7 +231,7 @@ class LightRAG:
llm_model_name: str = field(default="gpt-4o-mini")
"""Name of the LLM model used for generating responses."""
llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768)))
llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32000)))
"""Maximum number of tokens allowed per LLM response."""
llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 4)))

View file

@@ -118,7 +118,7 @@ async def _handle_entity_relation_summary(
tokenizer: Tokenizer = global_config["tokenizer"]
llm_max_tokens = global_config["llm_model_max_token_size"]
summary_max_tokens = global_config["summary_to_max_tokens"]
# summary_max_tokens = global_config["summary_to_max_tokens"]
language = global_config["addon_params"].get(
"language", PROMPTS["DEFAULT_LANGUAGE"]
@@ -145,7 +145,7 @@ async def _handle_entity_relation_summary(
use_prompt,
use_llm_func,
llm_response_cache=llm_response_cache,
- max_tokens=summary_max_tokens,
+ # max_tokens=summary_max_tokens,
cache_type="extract",
)
return summary
@@ -687,7 +687,10 @@ async def _rebuild_single_entity(
# Helper function to generate final description with optional LLM summary
async def _generate_final_description(combined_description: str) -> str:
if len(combined_description) > global_config["summary_to_max_tokens"]:
force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
if num_fragment >= force_llm_summary_on_merge:
return await _handle_entity_relation_summary(
entity_name,
combined_description,
@@ -842,8 +845,11 @@ async def _rebuild_single_relationship(
# )
weight = sum(weights) if weights else current_relationship.get("weight", 1.0)
- # Use summary if description is too long
- if len(combined_description) > global_config["summary_to_max_tokens"]:
+ # Use summary if description has too many fragments
+ force_llm_summary_on_merge = global_config["force_llm_summary_on_merge"]
+ num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
+ if num_fragment >= force_llm_summary_on_merge:
final_description = await _handle_entity_relation_summary(
f"{src}-{tgt}",
combined_description,
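In `operate.py`, the merge logic now triggers an LLM summary based on how many description fragments were merged (fragments are joined with `GRAPH_FIELD_SEP`) rather than on the character length of the combined text against `summary_to_max_tokens`. A standalone sketch of that decision; the separator value and the threshold shown are assumptions for illustration:

```python
# Minimal sketch of the fragment-count check used above.
GRAPH_FIELD_SEP = "<SEP>"  # assumed separator; LightRAG defines this in its prompt constants

def needs_llm_summary(combined_description: str, force_llm_summary_on_merge: int) -> bool:
    """Return True when enough fragments were merged to warrant an LLM summary."""
    num_fragment = combined_description.count(GRAPH_FIELD_SEP) + 1
    return num_fragment >= force_llm_summary_on_merge

# Example: four merged fragments against a threshold of 4 triggers summarization.
merged = "<SEP>".join(["desc one", "desc two", "desc three", "desc four"])
print(needs_llm_summary(merged, force_llm_summary_on_merge=4))  # True
```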