diff --git a/README-zh.md b/README-zh.md
index d6aef2c8..02d7295c 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -304,16 +304,14 @@ class QueryParam:
If None, keeps all chunks returned from initial retrieval.
"""
- max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "4000"))
- """Maximum number of tokens allowed for each retrieved text chunk."""
+ max_entity_tokens: int = int(os.getenv("MAX_ENTITY_TOKENS", "10000"))
+ """Maximum number of tokens allocated for entity context in unified token control system."""
- max_token_for_global_context: int = int(
- os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
- )
- """Maximum number of tokens allocated for relationship descriptions in global retrieval."""
+ max_relation_tokens: int = int(os.getenv("MAX_RELATION_TOKENS", "10000"))
+ """Maximum number of tokens allocated for relationship context in unified token control system."""
- max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
- """Maximum number of tokens allocated for entity descriptions in local retrieval."""
+ max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "32000"))
+ """Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
hl_keywords: list[str] = field(default_factory=list)
"""List of high-level keywords to prioritize in retrieval."""
diff --git a/README.md b/README.md
index 5fb6149b..a04eb1d7 100644
--- a/README.md
+++ b/README.md
@@ -311,16 +311,14 @@ class QueryParam:
If None, keeps all chunks returned from initial retrieval.
"""
- max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "4000"))
- """Maximum number of tokens allowed for each retrieved text chunk."""
+ max_entity_tokens: int = int(os.getenv("MAX_ENTITY_TOKENS", "10000"))
+ """Maximum number of tokens allocated for entity context in unified token control system."""
- max_token_for_global_context: int = int(
- os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
- )
- """Maximum number of tokens allocated for relationship descriptions in global retrieval."""
+ max_relation_tokens: int = int(os.getenv("MAX_RELATION_TOKENS", "10000"))
+ """Maximum number of tokens allocated for relationship context in unified token control system."""
- max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
- """Maximum number of tokens allocated for entity descriptions in local retrieval."""
+ max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "32000"))
+ """Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
conversation_history: list[dict[str, str]] = field(default_factory=list)
"""Stores past conversation history to maintain context.
diff --git a/env.example b/env.example
index 4515fe34..ec5d0bad 100644
--- a/env.example
+++ b/env.example
@@ -50,9 +50,12 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### RAG Query Configuration
# HISTORY_TURNS=3
-# MAX_TOKEN_TEXT_CHUNK=6000
-# MAX_TOKEN_RELATION_DESC=4000
-# MAX_TOKEN_ENTITY_DESC=4000
+
+### These parameters provide more precise control over total token usage
+# MAX_ENTITY_TOKENS=10000
+# MAX_RELATION_TOKENS=10000
+# MAX_TOTAL_TOKENS=32000
+
# COSINE_THRESHOLD=0.2
### Number of entities or relations to retrieve from KG
# TOP_K=60
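A minimal sketch of how the three new variables relate, assuming the defaults shown in this diff; the calculation is illustrative and not part of the codebase:

```python
import os

# Defaults mirror env.example / QueryParam in this diff.
max_entity_tokens = int(os.getenv("MAX_ENTITY_TOKENS", "10000"))
max_relation_tokens = int(os.getenv("MAX_RELATION_TOKENS", "10000"))
max_total_tokens = int(os.getenv("MAX_TOTAL_TOKENS", "32000"))

# Entity and relation limits are per-component caps; document chunks receive
# whatever remains of the total budget after the KG context and system prompt.
worst_case_chunk_budget = max_total_tokens - max_entity_tokens - max_relation_tokens
print(f"Worst-case budget left for chunks + system prompt: {worst_case_chunk_budget}")
```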
diff --git a/lightrag/api/routers/query_routes.py b/lightrag/api/routers/query_routes.py
index 0a0c6227..4005b599 100644
--- a/lightrag/api/routers/query_routes.py
+++ b/lightrag/api/routers/query_routes.py
@@ -61,22 +61,22 @@ class QueryRequest(BaseModel):
description="Number of text chunks to keep after reranking.",
)
- max_token_for_text_unit: Optional[int] = Field(
- gt=1,
+ max_entity_tokens: Optional[int] = Field(
default=None,
- description="Maximum number of tokens allowed for each retrieved text chunk.",
+ description="Maximum number of tokens allocated for entity context in unified token control system.",
+ ge=1,
)
- max_token_for_global_context: Optional[int] = Field(
- gt=1,
+ max_relation_tokens: Optional[int] = Field(
default=None,
- description="Maximum number of tokens allocated for relationship descriptions in global retrieval.",
+ description="Maximum number of tokens allocated for relationship context in unified token control system.",
+ ge=1,
)
- max_token_for_local_context: Optional[int] = Field(
- gt=1,
+ max_total_tokens: Optional[int] = Field(
default=None,
- description="Maximum number of tokens allocated for entity descriptions in local retrieval.",
+ description="Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt).",
+ ge=1,
)
conversation_history: Optional[List[Dict[str, Any]]] = Field(
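For context, a hedged example of calling the updated API with the renamed fields; the endpoint path, port, and response shape are assumptions based on this request model, not facts taken from the diff:

```python
import requests

# Hypothetical request against a locally running LightRAG API server.
payload = {
    "query": "Summarize the token budgeting changes.",
    "mode": "hybrid",
    "max_entity_tokens": 10000,    # cap for entity context
    "max_relation_tokens": 10000,  # cap for relationship context
    "max_total_tokens": 32000,     # budget for the whole prompt
}

resp = requests.post("http://localhost:9621/query", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json())
```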
diff --git a/lightrag/base.py b/lightrag/base.py
index 97564ac2..67d641ca 100644
--- a/lightrag/base.py
+++ b/lightrag/base.py
@@ -70,16 +70,14 @@ class QueryParam:
If None, keeps all chunks returned from initial retrieval.
"""
- max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "6000"))
- """Maximum number of tokens allowed for each retrieved text chunk."""
+ max_entity_tokens: int = int(os.getenv("MAX_ENTITY_TOKENS", "10000"))
+ """Maximum number of tokens allocated for entity context in unified token control system."""
- max_token_for_global_context: int = int(
- os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
- )
- """Maximum number of tokens allocated for relationship descriptions in global retrieval."""
+ max_relation_tokens: int = int(os.getenv("MAX_RELATION_TOKENS", "10000"))
+ """Maximum number of tokens allocated for relationship context in unified token control system."""
- max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
- """Maximum number of tokens allocated for entity descriptions in local retrieval."""
+ max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "32000"))
+ """Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
hl_keywords: list[str] = field(default_factory=list)
"""List of high-level keywords to prioritize in retrieval."""
diff --git a/lightrag/operate.py b/lightrag/operate.py
index be4499ab..668d42a9 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1569,7 +1569,9 @@ async def kg_query(
tokenizer: Tokenizer = global_config["tokenizer"]
len_of_prompts = len(tokenizer.encode(query + sys_prompt))
- logger.debug(f"[kg_query]Prompt Tokens: {len_of_prompts}")
+ logger.debug(
+ f"[kg_query] Sending to LLM: {len_of_prompts:,} tokens (Query: {len(tokenizer.encode(query))}, System: {len(tokenizer.encode(sys_prompt))})"
+ )
response = await use_model_func(
query,
@@ -1692,7 +1694,9 @@ async def extract_keywords_only(
tokenizer: Tokenizer = global_config["tokenizer"]
len_of_prompts = len(tokenizer.encode(kw_prompt))
- logger.debug(f"[kg_query]Prompt Tokens: {len_of_prompts}")
+ logger.debug(
+ f"[extract_keywords] Sending to LLM: {len_of_prompts:,} tokens (Prompt: {len_of_prompts})"
+ )
# 5. Call the LLM for keyword extraction
if param.model_func:
@@ -1864,7 +1868,7 @@ async def _build_query_context(
# Combine entities and relations contexts
entities_context = process_combine_contexts(
- hl_entities_context, ll_entities_context
+ ll_entities_context, hl_entities_context
)
relations_context = process_combine_contexts(
hl_relations_context, ll_relations_context
@@ -1894,6 +1898,163 @@ async def _build_query_context(
f"Final context: {len(entities_context)} entities, {len(relations_context)} relations, {len(text_units_context)} chunks"
)
+ # Unified token control system - Apply precise token limits to entities and relations
+ tokenizer = text_chunks_db.global_config.get("tokenizer")
+ if tokenizer:
+ # Get new token limits from query_param (with fallback to global_config)
+ max_entity_tokens = getattr(
+ query_param,
+ "max_entity_tokens",
+ text_chunks_db.global_config.get("MAX_ENTITY_TOKENS", 8000),
+ )
+ max_relation_tokens = getattr(
+ query_param,
+ "max_relation_tokens",
+ text_chunks_db.global_config.get("MAX_RELATION_TOKENS", 6000),
+ )
+ max_total_tokens = getattr(
+ query_param,
+ "max_total_tokens",
+ text_chunks_db.global_config.get("MAX_TOTAL_TOKENS", 32000),
+ )
+
+ # Truncate entities based on complete JSON serialization
+ if entities_context:
+ original_entity_count = len(entities_context)
+ entities_context = truncate_list_by_token_size(
+ entities_context,
+ key=lambda x: json.dumps(x, ensure_ascii=False),
+ max_token_size=max_entity_tokens,
+ tokenizer=tokenizer,
+ )
+ if len(entities_context) < original_entity_count:
+ logger.debug(
+ f"Truncated entities: {original_entity_count} -> {len(entities_context)} (entity max tokens: {max_entity_tokens})"
+ )
+
+ # Truncate relations based on complete JSON serialization
+ if relations_context:
+ original_relation_count = len(relations_context)
+ relations_context = truncate_list_by_token_size(
+ relations_context,
+ key=lambda x: json.dumps(x, ensure_ascii=False),
+ max_token_size=max_relation_tokens,
+ tokenizer=tokenizer,
+ )
+ if len(relations_context) < original_relation_count:
+ logger.debug(
+ f"Truncated relations: {original_relation_count} -> {len(relations_context)} (relation max tokens: {max_relation_tokens})"
+ )
+
+ # Calculate dynamic token limit for text chunks
+ entities_str = json.dumps(entities_context, ensure_ascii=False)
+ relations_str = json.dumps(relations_context, ensure_ascii=False)
+
+ # Calculate base context tokens (entities + relations + template)
+ kg_context_template = """-----Entities(KG)-----
+
+```json
+{entities_str}
+```
+
+-----Relationships(KG)-----
+
+```json
+{relations_str}
+```
+
+-----Document Chunks(DC)-----
+
+```json
+[]
+```
+
+"""
+ kg_context = kg_context_template.format(
+ entities_str=entities_str, relations_str=relations_str
+ )
+ kg_context_tokens = len(tokenizer.encode(kg_context))
+
+ # Calculate actual system prompt overhead dynamically
+ # 1. Calculate conversation history tokens
+ history_context = ""
+ if query_param.conversation_history:
+ history_context = get_conversation_turns(
+ query_param.conversation_history, query_param.history_turns
+ )
+ history_tokens = (
+ len(tokenizer.encode(history_context)) if history_context else 0
+ )
+
+ # 2. Calculate system prompt template tokens (excluding context_data)
+ user_prompt = query_param.user_prompt if query_param.user_prompt else ""
+ response_type = (
+ query_param.response_type
+ if query_param.response_type
+ else "Multiple Paragraphs"
+ )
+
+ # Get the system prompt template from PROMPTS
+ sys_prompt_template = text_chunks_db.global_config.get(
+ "system_prompt_template", PROMPTS["rag_response"]
+ )
+
+ # Create a sample system prompt with placeholders filled (excluding context_data)
+ sample_sys_prompt = sys_prompt_template.format(
+ history=history_context,
+ context_data="", # Empty for overhead calculation
+ response_type=response_type,
+ user_prompt=user_prompt,
+ )
+ sys_prompt_template_tokens = len(tokenizer.encode(sample_sys_prompt))
+
+ # Total system prompt overhead = template + query tokens
+ query_tokens = len(tokenizer.encode(query))
+ sys_prompt_overhead = sys_prompt_template_tokens + query_tokens
+
+ buffer_tokens = 100 # Safety buffer
+
+ # Calculate available tokens for text chunks
+ used_tokens = kg_context_tokens + sys_prompt_overhead + buffer_tokens
+ available_chunk_tokens = max_total_tokens - used_tokens
+
+ logger.debug(
+ f"Token allocation - Total: {max_total_tokens}, History: {history_tokens}, SysPrompt: {sys_prompt_overhead}, KG: {kg_context_tokens}, Buffer: {buffer_tokens}, Available for chunks: {available_chunk_tokens}"
+ )
+
+ # Re-process chunks with dynamic token limit
+ if text_units_context:
+ # Build a minimal chunk list so chunks can be re-truncated with the dynamic limit
+ temp_chunks = [
+ {"content": chunk["content"], "file_path": chunk["file_path"]}
+ for chunk in text_units_context
+ ]
+
+ # Apply token truncation to chunks using the dynamic limit
+ truncated_chunks = await process_chunks_unified(
+ query=query,
+ chunks=temp_chunks,
+ query_param=query_param,
+ global_config=text_chunks_db.global_config,
+ source_type="mixed",
+ chunk_token_limit=available_chunk_tokens, # Pass dynamic limit
+ )
+
+ # Rebuild text_units_context with truncated chunks
+ text_units_context = []
+ for i, chunk in enumerate(truncated_chunks):
+ text_units_context.append(
+ {
+ "id": i + 1,
+ "content": chunk["content"],
+ "file_path": chunk.get("file_path", "unknown_source"),
+ }
+ )
+
+ logger.debug(
+ f"Re-truncated chunks for dynamic token limit: {len(temp_chunks)} -> {len(text_units_context)} (chunk available tokens: {available_chunk_tokens})"
+ )
+
# not necessary to use LLM to generate a response
if not entities_context and not relations_context:
return None
@@ -1982,18 +2143,6 @@ async def _get_node_data(
knowledge_graph_inst,
)
- tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
- len_node_datas = len(node_datas)
- node_datas = truncate_list_by_token_size(
- node_datas,
- key=lambda x: x["description"] if x["description"] is not None else "",
- max_token_size=query_param.max_token_for_local_context,
- tokenizer=tokenizer,
- )
- logger.debug(
- f"Truncate entities from {len_node_datas} to {len(node_datas)} (max tokens:{query_param.max_token_for_local_context})"
- )
-
logger.info(
f"Local query: {len(node_datas)} entites, {len(use_relations)} relations, {len(use_text_units)} chunks"
)
@@ -2199,20 +2348,9 @@ async def _find_most_related_edges_from_entities(
}
all_edges_data.append(combined)
- tokenizer: Tokenizer = knowledge_graph_inst.global_config.get("tokenizer")
all_edges_data = sorted(
all_edges_data, key=lambda x: (x["rank"], x["weight"]), reverse=True
)
- all_edges_data = truncate_list_by_token_size(
- all_edges_data,
- key=lambda x: x["description"] if x["description"] is not None else "",
- max_token_size=query_param.max_token_for_global_context,
- tokenizer=tokenizer,
- )
-
- logger.debug(
- f"Truncate relations from {len(all_edges)} to {len(all_edges_data)} (max tokens:{query_param.max_token_for_global_context})"
- )
return all_edges_data
@@ -2269,16 +2407,9 @@ async def _get_edge_data(
}
edge_datas.append(combined)
- tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
edge_datas = sorted(
edge_datas, key=lambda x: (x["rank"], x["weight"]), reverse=True
)
- edge_datas = truncate_list_by_token_size(
- edge_datas,
- key=lambda x: x["description"] if x["description"] is not None else "",
- max_token_size=query_param.max_token_for_global_context,
- tokenizer=tokenizer,
- )
use_entities, use_text_units = await asyncio.gather(
_find_most_related_entities_from_relationships(
edge_datas,
@@ -2388,18 +2519,6 @@ async def _find_most_related_entities_from_relationships(
combined = {**node, "entity_name": entity_name, "rank": degree}
node_datas.append(combined)
- tokenizer: Tokenizer = knowledge_graph_inst.global_config.get("tokenizer")
- len_node_datas = len(node_datas)
- node_datas = truncate_list_by_token_size(
- node_datas,
- key=lambda x: x["description"] if x["description"] is not None else "",
- max_token_size=query_param.max_token_for_local_context,
- tokenizer=tokenizer,
- )
- logger.debug(
- f"Truncate entities from {len_node_datas} to {len(node_datas)} (max tokens:{query_param.max_token_for_local_context})"
- )
-
return node_datas
@@ -2491,13 +2610,64 @@ async def naive_query(
if chunks is None or len(chunks) == 0:
return PROMPTS["fail_response"]
- # Process chunks using unified processing
+ # Calculate dynamic token limit for chunks
+ # Get token limits from query_param (with fallback to global_config)
+ max_total_tokens = getattr(
+ query_param, "max_total_tokens", global_config.get("MAX_TOTAL_TOKENS", 32000)
+ )
+
+ # Calculate conversation history tokens
+ history_context = ""
+ if query_param.conversation_history:
+ history_context = get_conversation_turns(
+ query_param.conversation_history, query_param.history_turns
+ )
+ history_tokens = len(tokenizer.encode(history_context)) if history_context else 0
+
+ # Calculate system prompt template tokens (excluding content_data)
+ user_prompt = query_param.user_prompt if query_param.user_prompt else ""
+ response_type = (
+ query_param.response_type
+ if query_param.response_type
+ else "Multiple Paragraphs"
+ )
+
+ # Use the provided system prompt or default
+ sys_prompt_template = (
+ system_prompt if system_prompt else PROMPTS["naive_rag_response"]
+ )
+
+ # Create a sample system prompt with empty content_data to calculate overhead
+ sample_sys_prompt = sys_prompt_template.format(
+ content_data="", # Empty for overhead calculation
+ response_type=response_type,
+ history=history_context,
+ user_prompt=user_prompt,
+ )
+ sys_prompt_template_tokens = len(tokenizer.encode(sample_sys_prompt))
+
+ # Total system prompt overhead = template + query tokens
+ query_tokens = len(tokenizer.encode(query))
+ sys_prompt_overhead = sys_prompt_template_tokens + query_tokens
+
+ buffer_tokens = 100 # Safety buffer
+
+ # Calculate available tokens for chunks
+ used_tokens = sys_prompt_overhead + buffer_tokens
+ available_chunk_tokens = max_total_tokens - used_tokens
+
+ logger.debug(
+ f"Naive query token allocation - Total: {max_total_tokens}, History: {history_tokens}, SysPrompt: {sys_prompt_overhead}, Buffer: {buffer_tokens}, Available for chunks: {available_chunk_tokens}"
+ )
+
+ # Process chunks using unified processing with dynamic token limit
processed_chunks = await process_chunks_unified(
query=query,
chunks=chunks,
query_param=query_param,
global_config=global_config,
source_type="vector",
+ chunk_token_limit=available_chunk_tokens, # Pass dynamic limit
)
logger.info(f"Final context: {len(processed_chunks)} chunks")
@@ -2548,7 +2718,9 @@ async def naive_query(
return sys_prompt
len_of_prompts = len(tokenizer.encode(query + sys_prompt))
- logger.debug(f"[naive_query]Prompt Tokens: {len_of_prompts}")
+ logger.debug(
+ f"[naive_query] Sending to LLM: {len_of_prompts:,} tokens (Query: {len(tokenizer.encode(query))}, System: {len(tokenizer.encode(sys_prompt))})"
+ )
response = await use_model_func(
query,
@@ -2672,7 +2844,9 @@ async def kg_query_with_keywords(
tokenizer: Tokenizer = global_config["tokenizer"]
len_of_prompts = len(tokenizer.encode(query + sys_prompt))
- logger.debug(f"[kg_query_with_keywords]Prompt Tokens: {len_of_prompts}")
+ logger.debug(
+ f"[kg_query_with_keywords] Sending to LLM: {len_of_prompts:,} tokens (Query: {len(tokenizer.encode(query))}, System: {len(tokenizer.encode(sys_prompt))})"
+ )
# 6. Generate response
response = await use_model_func(
@@ -2849,6 +3023,7 @@ async def process_chunks_unified(
query_param: QueryParam,
global_config: dict,
source_type: str = "mixed",
+ chunk_token_limit: int | None = None, # Optional dynamic token limit for chunks
) -> list[dict]:
"""
Unified processing for text chunks: deduplication, chunk_top_k limiting, reranking, and token truncation.
@@ -2859,6 +3034,7 @@ async def process_chunks_unified(
query_param: Query parameters containing configuration
global_config: Global configuration dictionary
source_type: Source type for logging ("vector", "entity", "relationship", "mixed")
+ chunk_token_limit: Dynamic token limit for chunks (if None, uses default)
Returns:
Processed and filtered list of text chunks
@@ -2901,16 +3077,25 @@ async def process_chunks_unified(
# 4. Token-based final truncation
tokenizer = global_config.get("tokenizer")
if tokenizer and unique_chunks:
+ # Set default chunk_token_limit if not provided
+ if chunk_token_limit is None:
+ # Get default from query_param or global_config
+ chunk_token_limit = getattr(
+ query_param,
+ "max_total_tokens",
+ global_config.get("MAX_TOTAL_TOKENS", 32000),
+ )
+
original_count = len(unique_chunks)
unique_chunks = truncate_list_by_token_size(
unique_chunks,
key=lambda x: x.get("content", ""),
- max_token_size=query_param.max_token_for_text_unit,
+ max_token_size=chunk_token_limit,
tokenizer=tokenizer,
)
logger.debug(
f"Token truncation: {len(unique_chunks)} chunks from {original_count} "
- f"(max tokens: {query_param.max_token_for_text_unit}, source: {source_type})"
+ f"(chunk available tokens: {chunk_token_limit}, source: {source_type})"
)
return unique_chunks
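The chunk budget computed in _build_query_context and naive_query reduces to simple arithmetic; a self-contained sketch with invented token counts (only the formula mirrors the diff):

```python
# Invented counts for illustration; the arithmetic matches the code above.
max_total_tokens = 32000     # MAX_TOTAL_TOKENS
kg_context_tokens = 9500     # serialized entities + relations + template (kg_query path)
sys_prompt_overhead = 1800   # formatted system prompt template + query tokens
buffer_tokens = 100          # fixed safety margin

available_chunk_tokens = max_total_tokens - (
    kg_context_tokens + sys_prompt_overhead + buffer_tokens
)
print(available_chunk_tokens)  # 20600 tokens remain for document chunks
```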
diff --git a/lightrag_webui/src/api/lightrag.ts b/lightrag_webui/src/api/lightrag.ts
index 24b299aa..77601ec7 100644
--- a/lightrag_webui/src/api/lightrag.ts
+++ b/lightrag_webui/src/api/lightrag.ts
@@ -90,12 +90,16 @@ export type QueryRequest = {
stream?: boolean
/** Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode. */
top_k?: number
- /** Maximum number of tokens allowed for each retrieved text chunk. */
- max_token_for_text_unit?: number
- /** Maximum number of tokens allocated for relationship descriptions in global retrieval. */
- max_token_for_global_context?: number
- /** Maximum number of tokens allocated for entity descriptions in local retrieval. */
- max_token_for_local_context?: number
+ /** Maximum number of text chunks to retrieve and process. */
+ chunk_top_k?: number
+ /** Number of text chunks to keep after reranking. */
+ chunk_rerank_top_k?: number
+ /** Maximum number of tokens allocated for entity context in unified token control system. */
+ max_entity_tokens?: number
+ /** Maximum number of tokens allocated for relationship context in unified token control system. */
+ max_relation_tokens?: number
+ /** Maximum total token budget for the entire query context (entities + relations + chunks + system prompt). */
+ max_total_tokens?: number
/**
* Stores past conversation history to maintain context.
* Format: [{"role": "user/assistant", "content": "message"}].
diff --git a/lightrag_webui/src/components/retrieval/QuerySettings.tsx b/lightrag_webui/src/components/retrieval/QuerySettings.tsx
index 735a4190..b21f5b11 100644
--- a/lightrag_webui/src/components/retrieval/QuerySettings.tsx
+++ b/lightrag_webui/src/components/retrieval/QuerySettings.tsx
@@ -132,30 +132,81 @@ export default function QuerySettings() {
>
+ {/* Chunk Top K */}
+ <>
+ {t('retrievePanel.querySettings.chunkTopKTooltip')}
+ {t('retrievePanel.querySettings.chunkRerankTopKTooltip')}
- {t('retrievePanel.querySettings.maxTokensTextUnitTooltip')}
+ {t('retrievePanel.querySettings.maxEntityTokensTooltip')}
- {t('retrievePanel.querySettings.maxTokensGlobalContextTooltip')}
+ {t('retrievePanel.querySettings.maxRelationTokensTooltip')}
- {t('retrievePanel.querySettings.maxTokensLocalContextTooltip')}
+ {t('retrievePanel.querySettings.maxTotalTokensTooltip')}