Update token limit

zrguo 2025-07-14 15:53:48 +08:00
parent ba0cffd853
commit ef2115d437
14 changed files with 459 additions and 172 deletions

View file

@@ -304,16 +304,14 @@ class QueryParam:
If None, keeps all chunks returned from initial retrieval.
"""
max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "4000"))
"""Maximum number of tokens allowed for each retrieved text chunk."""
max_entity_tokens: int = int(os.getenv("MAX_ENTITY_TOKENS", "10000"))
"""Maximum number of tokens allocated for entity context in unified token control system."""
max_token_for_global_context: int = int(
os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
)
"""Maximum number of tokens allocated for relationship descriptions in global retrieval."""
max_relation_tokens: int = int(os.getenv("MAX_RELATION_TOKENS", "10000"))
"""Maximum number of tokens allocated for relationship context in unified token control system."""
max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
"""Maximum number of tokens allocated for entity descriptions in local retrieval."""
max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "32000"))
"""Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
hl_keywords: list[str] = field(default_factory=list)
"""List of high-level keywords to prioritize in retrieval."""

View file

@@ -311,16 +311,14 @@ class QueryParam:
If None, keeps all chunks returned from initial retrieval.
"""
max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "4000"))
"""Maximum number of tokens allowed for each retrieved text chunk."""
max_entity_tokens: int = int(os.getenv("MAX_ENTITY_TOKENS", "10000"))
"""Maximum number of tokens allocated for entity context in unified token control system."""
max_token_for_global_context: int = int(
os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
)
"""Maximum number of tokens allocated for relationship descriptions in global retrieval."""
max_relation_tokens: int = int(os.getenv("MAX_RELATION_TOKENS", "10000"))
"""Maximum number of tokens allocated for relationship context in unified token control system."""
max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
"""Maximum number of tokens allocated for entity descriptions in local retrieval."""
max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "32000"))
"""Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
conversation_history: list[dict[str, str]] = field(default_factory=list)
"""Stores past conversation history to maintain context.

View file

@@ -50,9 +50,12 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### RAG Query Configuration
# HISTORY_TURNS=3
# MAX_TOKEN_TEXT_CHUNK=6000
# MAX_TOKEN_RELATION_DESC=4000
# MAX_TOKEN_ENTITY_DESC=4000
### These parameters provide more precise control over total token usage
# MAX_ENTITY_TOKENS=10000
# MAX_RELATION_TOKENS=10000
# MAX_TOTAL_TOKENS=32000
# COSINE_THRESHOLD=0.2
### Number of entities or relations to retrieve from KG
# TOP_K=60
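
Because these defaults are read with os.getenv inside the dataclass field definitions, they are captured once at import time; a hedged sketch of loading them early enough (python-dotenv here is an assumption, any mechanism that sets the variables before the import works):

```python
# Sketch: the MAX_*_TOKENS defaults are evaluated when lightrag is imported,
# so the environment must be populated before that import happens.
from dotenv import load_dotenv  # assumption: python-dotenv is installed

load_dotenv()  # exposes MAX_ENTITY_TOKENS / MAX_RELATION_TOKENS / MAX_TOTAL_TOKENS

from lightrag import QueryParam  # defaults now reflect the .env values

print(QueryParam().max_total_tokens)  # 32000 unless overridden in .env
```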

View file

@@ -61,22 +61,22 @@ class QueryRequest(BaseModel):
description="Number of text chunks to keep after reranking.",
)
max_token_for_text_unit: Optional[int] = Field(
gt=1,
max_entity_tokens: Optional[int] = Field(
default=None,
description="Maximum number of tokens allowed for each retrieved text chunk.",
description="Maximum number of tokens allocated for entity context in unified token control system.",
ge=1,
)
max_token_for_global_context: Optional[int] = Field(
gt=1,
max_relation_tokens: Optional[int] = Field(
default=None,
description="Maximum number of tokens allocated for relationship descriptions in global retrieval.",
description="Maximum number of tokens allocated for relationship context in unified token control system.",
ge=1,
)
max_token_for_local_context: Optional[int] = Field(
gt=1,
max_total_tokens: Optional[int] = Field(
default=None,
description="Maximum number of tokens allocated for entity descriptions in local retrieval.",
description="Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt).",
ge=1,
)
conversation_history: Optional[List[Dict[str, Any]]] = Field(
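
For reference, a hedged example of exercising the renamed request fields over HTTP; the endpoint path and port are assumptions based on the default LightRAG API server, not part of this diff:

```python
# Hedged sketch: POSTing a query with the new unified token fields.
import requests

resp = requests.post(
    "http://localhost:9621/query",    # assumed default server address
    json={
        "query": "What changed in the token budget?",
        "mode": "hybrid",
        "max_entity_tokens": 10000,   # replaces max_token_for_local_context
        "max_relation_tokens": 10000, # replaces max_token_for_global_context
        "max_total_tokens": 32000,    # new overall context budget
    },
    timeout=60,
)
print(resp.json())
```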

View file

@@ -70,16 +70,14 @@ class QueryParam:
If None, keeps all chunks returned from initial retrieval.
"""
max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "6000"))
"""Maximum number of tokens allowed for each retrieved text chunk."""
max_entity_tokens: int = int(os.getenv("MAX_ENTITY_TOKENS", "10000"))
"""Maximum number of tokens allocated for entity context in unified token control system."""
max_token_for_global_context: int = int(
os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
)
"""Maximum number of tokens allocated for relationship descriptions in global retrieval."""
max_relation_tokens: int = int(os.getenv("MAX_RELATION_TOKENS", "10000"))
"""Maximum number of tokens allocated for relationship context in unified token control system."""
max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
"""Maximum number of tokens allocated for entity descriptions in local retrieval."""
max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "32000"))
"""Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
hl_keywords: list[str] = field(default_factory=list)
"""List of high-level keywords to prioritize in retrieval."""

View file

@@ -1569,7 +1569,9 @@ async def kg_query(
tokenizer: Tokenizer = global_config["tokenizer"]
len_of_prompts = len(tokenizer.encode(query + sys_prompt))
logger.debug(f"[kg_query]Prompt Tokens: {len_of_prompts}")
logger.debug(
f"[kg_query] Sending to LLM: {len_of_prompts:,} tokens (Query: {len(tokenizer.encode(query))}, System: {len(tokenizer.encode(sys_prompt))})"
)
response = await use_model_func(
query,
@@ -1692,7 +1694,9 @@ async def extract_keywords_only(
tokenizer: Tokenizer = global_config["tokenizer"]
len_of_prompts = len(tokenizer.encode(kw_prompt))
logger.debug(f"[kg_query]Prompt Tokens: {len_of_prompts}")
logger.debug(
f"[extract_keywords] Sending to LLM: {len_of_prompts:,} tokens (Prompt: {len_of_prompts})"
)
# 5. Call the LLM for keyword extraction
if param.model_func:
@@ -1864,7 +1868,7 @@ async def _build_query_context(
# Combine entities and relations contexts
entities_context = process_combine_contexts(
hl_entities_context, ll_entities_context
ll_entities_context, hl_entities_context
)
relations_context = process_combine_contexts(
hl_relations_context, ll_relations_context
@@ -1894,6 +1898,163 @@ async def _build_query_context(
f"Final context: {len(entities_context)} entities, {len(relations_context)} relations, {len(text_units_context)} chunks"
)
# Unified token control system - Apply precise token limits to entities and relations
tokenizer = text_chunks_db.global_config.get("tokenizer")
if tokenizer:
# Get new token limits from query_param (with fallback to global_config)
max_entity_tokens = getattr(
query_param,
"max_entity_tokens",
text_chunks_db.global_config.get("MAX_ENTITY_TOKENS", 8000),
)
max_relation_tokens = getattr(
query_param,
"max_relation_tokens",
text_chunks_db.global_config.get("MAX_RELATION_TOKENS", 6000),
)
max_total_tokens = getattr(
query_param,
"max_total_tokens",
text_chunks_db.global_config.get("MAX_TOTAL_TOKENS", 32000),
)
# Truncate entities based on complete JSON serialization
if entities_context:
original_entity_count = len(entities_context)
entities_context = truncate_list_by_token_size(
entities_context,
key=lambda x: json.dumps(x, ensure_ascii=False),
max_token_size=max_entity_tokens,
tokenizer=tokenizer,
)
if len(entities_context) < original_entity_count:
logger.debug(
f"Truncated entities: {original_entity_count} -> {len(entities_context)} (entity max tokens: {max_entity_tokens})"
)
# Truncate relations based on complete JSON serialization
if relations_context:
original_relation_count = len(relations_context)
relations_context = truncate_list_by_token_size(
relations_context,
key=lambda x: json.dumps(x, ensure_ascii=False),
max_token_size=max_relation_tokens,
tokenizer=tokenizer,
)
if len(relations_context) < original_relation_count:
logger.debug(
f"Truncated relations: {original_relation_count} -> {len(relations_context)} (relation max tokens: {max_relation_tokens})"
)
# Calculate dynamic token limit for text chunks
entities_str = json.dumps(entities_context, ensure_ascii=False)
relations_str = json.dumps(relations_context, ensure_ascii=False)
# Calculate base context tokens (entities + relations + template)
kg_context_template = """-----Entities(KG)-----
```json
{entities_str}
```
-----Relationships(KG)-----
```json
{relations_str}
```
-----Document Chunks(DC)-----
```json
[]
```
"""
kg_context = kg_context_template.format(
entities_str=entities_str, relations_str=relations_str
)
kg_context_tokens = len(tokenizer.encode(kg_context))
# Calculate actual system prompt overhead dynamically
# 1. Calculate conversation history tokens
history_context = ""
if query_param.conversation_history:
history_context = get_conversation_turns(
query_param.conversation_history, query_param.history_turns
)
history_tokens = (
len(tokenizer.encode(history_context)) if history_context else 0
)
# 2. Calculate system prompt template tokens (excluding context_data)
user_prompt = query_param.user_prompt if query_param.user_prompt else ""
response_type = (
query_param.response_type
if query_param.response_type
else "Multiple Paragraphs"
)
# Get the system prompt template from PROMPTS
sys_prompt_template = text_chunks_db.global_config.get(
"system_prompt_template", PROMPTS["rag_response"]
)
# Create a sample system prompt with placeholders filled (excluding context_data)
sample_sys_prompt = sys_prompt_template.format(
history=history_context,
context_data="", # Empty for overhead calculation
response_type=response_type,
user_prompt=user_prompt,
)
sys_prompt_template_tokens = len(tokenizer.encode(sample_sys_prompt))
# Total system prompt overhead = template + query tokens
query_tokens = len(tokenizer.encode(query))
sys_prompt_overhead = sys_prompt_template_tokens + query_tokens
buffer_tokens = 100  # Safety buffer
# Calculate available tokens for text chunks
used_tokens = kg_context_tokens + sys_prompt_overhead + buffer_tokens
available_chunk_tokens = max_total_tokens - used_tokens
logger.debug(
f"Token allocation - Total: {max_total_tokens}, History: {history_tokens}, SysPrompt: {sys_prompt_overhead}, KG: {kg_context_tokens}, Buffer: {buffer_tokens}, Available for chunks: {available_chunk_tokens}"
)
# Re-process chunks with dynamic token limit
if text_units_context:
# Create a temporary query_param copy with adjusted chunk token limit
temp_chunks = [
{"content": chunk["content"], "file_path": chunk["file_path"]}
for chunk in text_units_context
]
# Apply token truncation to chunks using the dynamic limit
truncated_chunks = await process_chunks_unified(
query=query,
chunks=temp_chunks,
query_param=query_param,
global_config=text_chunks_db.global_config,
source_type="mixed",
chunk_token_limit=available_chunk_tokens, # Pass dynamic limit
)
# Rebuild text_units_context with truncated chunks
text_units_context = []
for i, chunk in enumerate(truncated_chunks):
text_units_context.append(
{
"id": i + 1,
"content": chunk["content"],
"file_path": chunk.get("file_path", "unknown_source"),
}
)
logger.debug(
f"Re-truncated chunks for dynamic token limit: {len(temp_chunks)} -> {len(text_units_context)} (chunk available tokens: {available_chunk_tokens})"
)
# not necessary to use LLM to generate a response
if not entities_context and not relations_context:
return None
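
The chunk budget above reduces to a single subtraction; a worked sketch with illustrative numbers (none of these counts come from the diff):

```python
# Illustrative arithmetic for the dynamic chunk budget computed above.
max_total_tokens = 32_000

kg_context_tokens = 14_500   # entities + relations after their own truncation
sys_prompt_overhead = 1_200  # prompt template + history + query tokens (example)
buffer_tokens = 100          # fixed safety margin

available_chunk_tokens = max_total_tokens - (
    kg_context_tokens + sys_prompt_overhead + buffer_tokens
)
print(available_chunk_tokens)  # 16200 tokens left for document chunks
```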
@@ -1982,18 +2143,6 @@ async def _get_node_data(
knowledge_graph_inst,
)
tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
len_node_datas = len(node_datas)
node_datas = truncate_list_by_token_size(
node_datas,
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_local_context,
tokenizer=tokenizer,
)
logger.debug(
f"Truncate entities from {len_node_datas} to {len(node_datas)} (max tokens:{query_param.max_token_for_local_context})"
)
logger.info(
f"Local query: {len(node_datas)} entites, {len(use_relations)} relations, {len(use_text_units)} chunks"
)
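
This removal (and the similar ones below) deletes per-function calls to truncate_list_by_token_size, whose job the unified system now does once in _build_query_context. A simplified sketch of the helper's assumed behavior, keeping the longest prefix whose encoded keys fit the budget (see lightrag.utils for the real implementation):

```python
# Simplified sketch of truncate_list_by_token_size as used in this diff.
# Assumption: the real helper in lightrag.utils keeps the longest prefix of
# `items` whose encoded keys fit within max_token_size.
def truncate_list_by_token_size(items, key, max_token_size, tokenizer):
    total = 0
    for i, item in enumerate(items):
        total += len(tokenizer.encode(key(item)))
        if total > max_token_size:
            return items[:i]
    return items
```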
@@ -2199,20 +2348,9 @@ async def _find_most_related_edges_from_entities(
}
all_edges_data.append(combined)
tokenizer: Tokenizer = knowledge_graph_inst.global_config.get("tokenizer")
all_edges_data = sorted(
all_edges_data, key=lambda x: (x["rank"], x["weight"]), reverse=True
)
all_edges_data = truncate_list_by_token_size(
all_edges_data,
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_global_context,
tokenizer=tokenizer,
)
logger.debug(
f"Truncate relations from {len(all_edges)} to {len(all_edges_data)} (max tokens:{query_param.max_token_for_global_context})"
)
return all_edges_data
@@ -2269,16 +2407,9 @@ async def _get_edge_data(
}
edge_datas.append(combined)
tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
edge_datas = sorted(
edge_datas, key=lambda x: (x["rank"], x["weight"]), reverse=True
)
edge_datas = truncate_list_by_token_size(
edge_datas,
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_global_context,
tokenizer=tokenizer,
)
use_entities, use_text_units = await asyncio.gather(
_find_most_related_entities_from_relationships(
edge_datas,
@@ -2388,18 +2519,6 @@ async def _find_most_related_entities_from_relationships(
combined = {**node, "entity_name": entity_name, "rank": degree}
node_datas.append(combined)
tokenizer: Tokenizer = knowledge_graph_inst.global_config.get("tokenizer")
len_node_datas = len(node_datas)
node_datas = truncate_list_by_token_size(
node_datas,
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_local_context,
tokenizer=tokenizer,
)
logger.debug(
f"Truncate entities from {len_node_datas} to {len(node_datas)} (max tokens:{query_param.max_token_for_local_context})"
)
return node_datas
@@ -2491,13 +2610,64 @@ async def naive_query(
if chunks is None or len(chunks) == 0:
return PROMPTS["fail_response"]
# Process chunks using unified processing
# Calculate dynamic token limit for chunks
# Get token limits from query_param (with fallback to global_config)
max_total_tokens = getattr(
query_param, "max_total_tokens", global_config.get("MAX_TOTAL_TOKENS", 32000)
)
# Calculate conversation history tokens
history_context = ""
if query_param.conversation_history:
history_context = get_conversation_turns(
query_param.conversation_history, query_param.history_turns
)
history_tokens = len(tokenizer.encode(history_context)) if history_context else 0
# Calculate system prompt template tokens (excluding content_data)
user_prompt = query_param.user_prompt if query_param.user_prompt else ""
response_type = (
query_param.response_type
if query_param.response_type
else "Multiple Paragraphs"
)
# Use the provided system prompt or default
sys_prompt_template = (
system_prompt if system_prompt else PROMPTS["naive_rag_response"]
)
# Create a sample system prompt with empty content_data to calculate overhead
sample_sys_prompt = sys_prompt_template.format(
content_data="", # Empty for overhead calculation
response_type=response_type,
history=history_context,
user_prompt=user_prompt,
)
sys_prompt_template_tokens = len(tokenizer.encode(sample_sys_prompt))
# Total system prompt overhead = template + query tokens
query_tokens = len(tokenizer.encode(query))
sys_prompt_overhead = sys_prompt_template_tokens + query_tokens
buffer_tokens = 100 # Safety buffer
# Calculate available tokens for chunks
used_tokens = sys_prompt_overhead + buffer_tokens
available_chunk_tokens = max_total_tokens - used_tokens
logger.debug(
f"Naive query token allocation - Total: {max_total_tokens}, History: {history_tokens}, SysPrompt: {sys_prompt_overhead}, Buffer: {buffer_tokens}, Available for chunks: {available_chunk_tokens}"
)
# Process chunks using unified processing with dynamic token limit
processed_chunks = await process_chunks_unified(
query=query,
chunks=chunks,
query_param=query_param,
global_config=global_config,
source_type="vector",
chunk_token_limit=available_chunk_tokens, # Pass dynamic limit
)
logger.info(f"Final context: {len(processed_chunks)} chunks")
@@ -2548,7 +2718,9 @@ async def naive_query(
return sys_prompt
len_of_prompts = len(tokenizer.encode(query + sys_prompt))
logger.debug(f"[naive_query]Prompt Tokens: {len_of_prompts}")
logger.debug(
f"[naive_query] Sending to LLM: {len_of_prompts:,} tokens (Query: {len(tokenizer.encode(query))}, System: {len(tokenizer.encode(sys_prompt))})"
)
response = await use_model_func(
query,
@@ -2672,7 +2844,9 @@ async def kg_query_with_keywords(
tokenizer: Tokenizer = global_config["tokenizer"]
len_of_prompts = len(tokenizer.encode(query + sys_prompt))
logger.debug(f"[kg_query_with_keywords]Prompt Tokens: {len_of_prompts}")
logger.debug(
f"[kg_query_with_keywords] Sending to LLM: {len_of_prompts:,} tokens (Query: {len(tokenizer.encode(query))}, System: {len(tokenizer.encode(sys_prompt))})"
)
# 6. Generate response
response = await use_model_func(
@@ -2849,6 +3023,7 @@ async def process_chunks_unified(
query_param: QueryParam,
global_config: dict,
source_type: str = "mixed",
chunk_token_limit: int | None = None,  # Add parameter for dynamic token limit
) -> list[dict]:
"""
Unified processing for text chunks: deduplication, chunk_top_k limiting, reranking, and token truncation.
@@ -2859,6 +3034,7 @@ async def process_chunks_unified(
query_param: Query parameters containing configuration
global_config: Global configuration dictionary
source_type: Source type for logging ("vector", "entity", "relationship", "mixed")
chunk_token_limit: Dynamic token limit for chunks (if None, uses default)
Returns:
Processed and filtered list of text chunks
@@ -2901,16 +3077,25 @@ async def process_chunks_unified(
# 4. Token-based final truncation
tokenizer = global_config.get("tokenizer")
if tokenizer and unique_chunks:
# Set default chunk_token_limit if not provided
if chunk_token_limit is None:
# Get default from query_param or global_config
chunk_token_limit = getattr(
query_param,
"max_total_tokens",
global_config.get("MAX_TOTAL_TOKENS", 32000),
)
original_count = len(unique_chunks)
unique_chunks = truncate_list_by_token_size(
unique_chunks,
key=lambda x: x.get("content", ""),
max_token_size=query_param.max_token_for_text_unit,
max_token_size=chunk_token_limit,
tokenizer=tokenizer,
)
logger.debug(
f"Token truncation: {len(unique_chunks)} chunks from {original_count} "
f"(max tokens: {query_param.max_token_for_text_unit}, source: {source_type})"
f"(chunk available tokens: {chunk_token_limit}, source: {source_type})"
)
return unique_chunks

View file

@@ -90,12 +90,16 @@ export type QueryRequest = {
stream?: boolean
/** Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode. */
top_k?: number
/** Maximum number of tokens allowed for each retrieved text chunk. */
max_token_for_text_unit?: number
/** Maximum number of tokens allocated for relationship descriptions in global retrieval. */
max_token_for_global_context?: number
/** Maximum number of tokens allocated for entity descriptions in local retrieval. */
max_token_for_local_context?: number
/** Maximum number of text chunks to retrieve and process. */
chunk_top_k?: number
/** Number of text chunks to keep after reranking. */
chunk_rerank_top_k?: number
/** Maximum number of tokens allocated for entity context in unified token control system. */
max_entity_tokens?: number
/** Maximum number of tokens allocated for relationship context in unified token control system. */
max_relation_tokens?: number
/** Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt). */
max_total_tokens?: number
/**
* Stores past conversation history to maintain context.
* Format: [{"role": "user/assistant", "content": "message"}].

View file

@@ -132,30 +132,81 @@ export default function QuerySettings() {
</div>
</>
{/* Chunk Top K */}
<>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="chunk_top_k" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.chunkTopK')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.chunkTopKTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<div>
<NumberInput
id="chunk_top_k"
stepper={1}
value={querySettings.chunk_top_k}
onValueChange={(v) => handleChange('chunk_top_k', v)}
min={1}
placeholder={t('retrievePanel.querySettings.chunkTopKPlaceholder')}
/>
</div>
</>
{/* Chunk Rerank Top K */}
<>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="chunk_rerank_top_k" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.chunkRerankTopK')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.chunkRerankTopKTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<div>
<NumberInput
id="chunk_rerank_top_k"
stepper={1}
value={querySettings.chunk_rerank_top_k}
onValueChange={(v) => handleChange('chunk_rerank_top_k', v)}
min={1}
placeholder={t('retrievePanel.querySettings.chunkRerankTopKPlaceholder')}
/>
</div>
</>
{/* Max Tokens */}
<>
<>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="max_token_for_text_unit" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.maxTokensTextUnit')}
<label htmlFor="max_entity_tokens" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.maxEntityTokens')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.maxTokensTextUnitTooltip')}</p>
<p>{t('retrievePanel.querySettings.maxEntityTokensTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<div>
{/* Removed sr-only label */}
<NumberInput
id="max_token_for_text_unit"
id="max_entity_tokens"
stepper={500}
value={querySettings.max_token_for_text_unit}
onValueChange={(v) => handleChange('max_token_for_text_unit', v)}
value={querySettings.max_entity_tokens}
onValueChange={(v) => handleChange('max_entity_tokens', v)}
min={1}
placeholder={t('retrievePanel.querySettings.maxTokensTextUnit')}
placeholder={t('retrievePanel.querySettings.maxEntityTokens')}
/>
</div>
</>
@@ -164,24 +215,23 @@
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="max_token_for_global_context" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.maxTokensGlobalContext')}
<label htmlFor="max_relation_tokens" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.maxRelationTokens')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.maxTokensGlobalContextTooltip')}</p>
<p>{t('retrievePanel.querySettings.maxRelationTokensTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<div>
{/* Removed sr-only label */}
<NumberInput
id="max_token_for_global_context"
id="max_relation_tokens"
stepper={500}
value={querySettings.max_token_for_global_context}
onValueChange={(v) => handleChange('max_token_for_global_context', v)}
value={querySettings.max_relation_tokens}
onValueChange={(v) => handleChange('max_relation_tokens', v)}
min={1}
placeholder={t('retrievePanel.querySettings.maxTokensGlobalContext')}
placeholder={t('retrievePanel.querySettings.maxRelationTokens')}
/>
</div>
</>
@@ -190,24 +240,23 @@
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="max_token_for_local_context" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.maxTokensLocalContext')}
<label htmlFor="max_total_tokens" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.maxTotalTokens')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.maxTokensLocalContextTooltip')}</p>
<p>{t('retrievePanel.querySettings.maxTotalTokensTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<div>
{/* Removed sr-only label */}
<NumberInput
id="max_token_for_local_context"
stepper={500}
value={querySettings.max_token_for_local_context}
onValueChange={(v) => handleChange('max_token_for_local_context', v)}
id="max_total_tokens"
stepper={1000}
value={querySettings.max_total_tokens}
onValueChange={(v) => handleChange('max_total_tokens', v)}
min={1}
placeholder={t('retrievePanel.querySettings.maxTokensLocalContext')}
placeholder={t('retrievePanel.querySettings.maxTotalTokens')}
/>
</div>
</>

View file

@@ -359,16 +359,22 @@
"singleParagraph": "فقرة واحدة",
"bulletPoints": "نقاط نقطية"
},
"topK": "أعلى K نتائج",
"topKTooltip": "عدد العناصر العلوية للاسترجاع. يمثل الكيانات في وضع 'محلي' والعلاقات في وضع 'عالمي'",
"topKPlaceholder": "عدد النتائج",
"maxTokensTextUnit": "أقصى عدد من الرموز لوحدة النص",
"maxTokensTextUnitTooltip": "الحد الأقصى لعدد الرموز المسموح به لكل جزء نصي مسترجع",
"maxTokensGlobalContext": "أقصى عدد من الرموز للسياق العالمي",
"maxTokensGlobalContextTooltip": "الحد الأقصى لعدد الرموز المخصص لأوصاف العلاقات في الاسترجاع العالمي",
"maxTokensLocalContext": "أقصى عدد من الرموز للسياق المحلي",
"maxTokensLocalContextTooltip": "الحد الأقصى لعدد الرموز المخصص لأوصاف الكيانات في الاسترجاع المحلي",
"historyTurns": "دورات التاريخ",
"topK": "أعلى K",
"topKTooltip": "عدد العناصر العلوية للاسترداد. يمثل الكيانات في الوضع 'المحلي' والعلاقات في الوضع 'العالمي'.",
"topKPlaceholder": "أدخل قيمة أعلى k",
"chunkTopK": "أعلى K للقطع",
"chunkTopKTooltip": "العدد الأقصى لقطع النص المراد استردادها ومعالجتها.",
"chunkTopKPlaceholder": "أدخل قيمة أعلى k للقطع",
"chunkRerankTopK": "أعلى K لإعادة الترتيب",
"chunkRerankTopKTooltip": "عدد قطع النص المراد الاحتفاظ بها بعد إعادة الترتيب.",
"chunkRerankTopKPlaceholder": "أدخل قيمة أعلى k لإعادة الترتيب",
"maxEntityTokens": "الحد الأقصى لرموز الكيان",
"maxEntityTokensTooltip": "الحد الأقصى لعدد الرموز المخصصة لسياق الكيان في نظام التحكم الموحد في الرموز",
"maxRelationTokens": "الحد الأقصى لرموز العلاقة",
"maxRelationTokensTooltip": "الحد الأقصى لعدد الرموز المخصصة لسياق العلاقة في نظام التحكم الموحد في الرموز",
"maxTotalTokens": "إجمالي الحد الأقصى للرموز",
"maxTotalTokensTooltip": "الحد الأقصى الإجمالي لميزانية الرموز لسياق الاستعلام بالكامل (الكيانات + العلاقات + الأجزاء + موجه النظام)",
"historyTurns": "أدوار التاريخ",
"historyTurnsTooltip": "عدد الدورات الكاملة للمحادثة (أزواج المستخدم-المساعد) التي يجب مراعاتها في سياق الرد",
"historyTurnsPlaceholder": "عدد دورات التاريخ",
"onlyNeedContext": "تحتاج فقط إلى السياق",

View file

@@ -359,15 +359,21 @@
"singleParagraph": "Single Paragraph",
"bulletPoints": "Bullet Points"
},
"topK": "Top K Results",
"topKTooltip": "Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode",
"topKPlaceholder": "Number of results",
"maxTokensTextUnit": "Max Tokens for Text Unit",
"maxTokensTextUnitTooltip": "Maximum number of tokens allowed for each retrieved text chunk",
"maxTokensGlobalContext": "Max Tokens for Global Context",
"maxTokensGlobalContextTooltip": "Maximum number of tokens allocated for relationship descriptions in global retrieval",
"maxTokensLocalContext": "Max Tokens for Local Context",
"maxTokensLocalContextTooltip": "Maximum number of tokens allocated for entity descriptions in local retrieval",
"topK": "Top K",
"topKTooltip": "Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode.",
"topKPlaceholder": "Enter top k value",
"chunkTopK": "Chunk Top K",
"chunkTopKTooltip": "Maximum number of text chunks to retrieve and process.",
"chunkTopKPlaceholder": "Enter chunk top k value",
"chunkRerankTopK": "Chunk Rerank Top K",
"chunkRerankTopKTooltip": "Number of text chunks to keep after reranking.",
"chunkRerankTopKPlaceholder": "Enter rerank top k value",
"maxEntityTokens": "Max Entity Tokens",
"maxEntityTokensTooltip": "Maximum number of tokens allocated for entity context in unified token control system",
"maxRelationTokens": "Max Relation Tokens",
"maxRelationTokensTooltip": "Maximum number of tokens allocated for relationship context in unified token control system",
"maxTotalTokens": "Max Total Tokens",
"maxTotalTokensTooltip": "Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)",
"historyTurns": "History Turns",
"historyTurnsTooltip": "Number of complete conversation turns (user-assistant pairs) to consider in the response context",
"historyTurnsPlaceholder": "Number of history turns",

View file

@@ -359,15 +359,21 @@
"singleParagraph": "Paragraphe unique",
"bulletPoints": "Points à puces"
},
"topK": "Top K résultats",
"topKTooltip": "Nombre d'éléments supérieurs à récupérer. Représente les entités en mode 'local' et les relations en mode 'global'",
"topKPlaceholder": "Nombre de résultats",
"maxTokensTextUnit": "Nombre maximum de jetons pour l'unité de texte",
"maxTokensTextUnitTooltip": "Nombre maximum de jetons autorisés pour chaque fragment de texte récupéré",
"maxTokensGlobalContext": "Nombre maximum de jetons pour le contexte global",
"maxTokensGlobalContextTooltip": "Nombre maximum de jetons alloués pour les descriptions des relations dans la récupération globale",
"maxTokensLocalContext": "Nombre maximum de jetons pour le contexte local",
"maxTokensLocalContextTooltip": "Nombre maximum de jetons alloués pour les descriptions des entités dans la récupération locale",
"topK": "Top K",
"topKTooltip": "Nombre d'éléments principaux à récupérer. Représente les entités en mode 'local' et les relations en mode 'global'.",
"topKPlaceholder": "Entrez la valeur top k",
"chunkTopK": "Top K des Chunks",
"chunkTopKTooltip": "Nombre maximum de chunks de texte à récupérer et traiter.",
"chunkTopKPlaceholder": "Entrez la valeur top k des chunks",
"chunkRerankTopK": "Top K du Reclassement",
"chunkRerankTopKTooltip": "Nombre de chunks de texte à conserver après reclassement.",
"chunkRerankTopKPlaceholder": "Entrez la valeur top k du reclassement",
"maxEntityTokens": "Limite de jetons d'entité",
"maxEntityTokensTooltip": "Nombre maximum de jetons alloués au contexte d'entité dans le système de contrôle de jetons unifié",
"maxRelationTokens": "Limite de jetons de relation",
"maxRelationTokensTooltip": "Nombre maximum de jetons alloués au contexte de relation dans le système de contrôle de jetons unifié",
"maxTotalTokens": "Limite totale de jetons",
"maxTotalTokensTooltip": "Budget total maximum de jetons pour l'ensemble du contexte de requête (entités + relations + blocs + prompt système)",
"historyTurns": "Tours d'historique",
"historyTurnsTooltip": "Nombre de tours complets de conversation (paires utilisateur-assistant) à prendre en compte dans le contexte de la réponse",
"historyTurnsPlaceholder": "Nombre de tours d'historique",

View file

@@ -359,15 +359,21 @@
"singleParagraph": "单段落",
"bulletPoints": "要点"
},
"topK": "Top K结果",
"topKTooltip": "检索的顶部项目数。在'local'模式下表示实体,在'global'模式下表示关系",
"topKPlaceholder": "结果数量",
"maxTokensTextUnit": "文本单元最大令牌数",
"maxTokensTextUnitTooltip": "每个检索文本块允许的最大令牌数",
"maxTokensGlobalContext": "全局上下文最大令牌数",
"maxTokensGlobalContextTooltip": "全局检索中关系描述的最大令牌数",
"maxTokensLocalContext": "本地上下文最大令牌数",
"maxTokensLocalContextTooltip": "本地检索中实体描述的最大令牌数",
"topK": "Top K",
"topKTooltip": "检索的顶部条目数量。在'local'模式下表示实体,在'global'模式下表示关系。",
"topKPlaceholder": "输入top k值",
"chunkTopK": "文本块 Top K",
"chunkTopKTooltip": "检索和处理的最大文本块数量。",
"chunkTopKPlaceholder": "输入文本块top k值",
"chunkRerankTopK": "重排序 Top K",
"chunkRerankTopKTooltip": "重排序后保留的文本块数量。",
"chunkRerankTopKPlaceholder": "输入重排序top k值",
"maxEntityTokens": "实体令牌数上限",
"maxEntityTokensTooltip": "统一令牌控制系统中分配给实体上下文的最大令牌数",
"maxRelationTokens": "关系令牌数上限",
"maxRelationTokensTooltip": "统一令牌控制系统中分配给关系上下文的最大令牌数",
"maxTotalTokens": "总令牌数上限",
"maxTotalTokensTooltip": "整个查询上下文的最大总令牌预算(实体+关系+文档块+系统提示)",
"historyTurns": "历史轮次",
"historyTurnsTooltip": "响应上下文中考虑的完整对话轮次(用户-助手对)数量",
"historyTurnsPlaceholder": "历史轮次数",

View file

@@ -300,7 +300,7 @@
"file_path": "來源",
"keywords": "Keys",
"weight": "權重"
}
}
},
"edge": {
"title": "關係",
@@ -359,15 +359,15 @@
"singleParagraph": "單段落",
"bulletPoints": "重點"
},
"topK": "Top K結果",
"topKTooltip": "檢索的前幾項結果數。在'local'模式下表示實體,在'global'模式下表示關係",
"topKPlaceholder": "結果數量",
"maxTokensTextUnit": "文字單元最大權杖數",
"maxTokensTextUnitTooltip": "每個檢索文字區塊允許的最大權杖數",
"maxTokensGlobalContext": "全域上下文最大權杖數",
"maxTokensGlobalContextTooltip": "全域檢索中關係描述的最大權杖數",
"maxTokensLocalContext": "本地上下文最大權杖數",
"maxTokensLocalContextTooltip": "本地檢索中實體描述的最大權杖數",
"topK": "Top K",
"topKTooltip": "檢索的頂部條目數量。在'local'模式下表示實體,在'global'模式下表示關係",
"topKPlaceholder": "輸入top k值",
"chunkTopK": "文字區塊 Top K",
"chunkTopKTooltip": "檢索和處理的最大文字區塊數量。",
"chunkTopKPlaceholder": "輸入文字區塊top k值",
"chunkRerankTopK": "重新排序 Top K",
"chunkRerankTopKTooltip": "重新排序後保留的文字區塊數量。",
"chunkRerankTopKPlaceholder": "輸入重新排序top k值",
"historyTurns": "歷史輪次",
"historyTurnsTooltip": "回應上下文中考慮的完整對話輪次(使用者-助手對)數量",
"historyTurnsPlaceholder": "歷史輪次數",
@@ -379,7 +379,13 @@
"streamResponseTooltip": "如果為True啟用即時串流輸出回應",
"userPrompt": "用戶提示詞",
"userPromptTooltip": "向LLM提供額外的響應要求與查詢內容無關僅用於處理輸出。",
"userPromptPlaceholder": "輸入自定義提示詞(可選)"
"userPromptPlaceholder": "輸入自定義提示詞(可選)",
"maxEntityTokens": "實體令牌數上限",
"maxEntityTokensTooltip": "統一令牌控制系統中分配給實體上下文的最大令牌數",
"maxRelationTokens": "關係令牌數上限",
"maxRelationTokensTooltip": "統一令牌控制系統中分配給關係上下文的最大令牌數",
"maxTotalTokens": "總令牌數上限",
"maxTotalTokensTooltip": "整個查詢上下文的最大總令牌預算(實體+關係+文檔塊+系統提示)"
}
},
"apiSite": {

View file

@@ -111,9 +111,11 @@ const useSettingsStoreBase = create<SettingsState>()(
mode: 'global',
response_type: 'Multiple Paragraphs',
top_k: 10,
max_token_for_text_unit: 6000,
max_token_for_global_context: 4000,
max_token_for_local_context: 4000,
chunk_top_k: 5,
chunk_rerank_top_k: 5,
max_entity_tokens: 10000,
max_relation_tokens: 10000,
max_total_tokens: 32000,
only_need_context: false,
only_need_prompt: false,
stream: true,
@@ -192,7 +194,7 @@ const useSettingsStoreBase = create<SettingsState>()(
{
name: 'settings-storage',
storage: createJSONStorage(() => localStorage),
version: 14,
version: 15,
migrate: (state: any, version: number) => {
if (version < 2) {
state.showEdgeLabel = false
@@ -215,9 +217,9 @@ const useSettingsStoreBase = create<SettingsState>()(
mode: 'global',
response_type: 'Multiple Paragraphs',
top_k: 10,
max_token_for_text_unit: 4000,
max_token_for_global_context: 4000,
max_token_for_local_context: 4000,
max_entity_tokens: 10000,
max_relation_tokens: 10000,
max_total_tokens: 32000,
only_need_context: false,
only_need_prompt: false,
stream: true,
@@ -260,6 +262,26 @@ const useSettingsStoreBase = create<SettingsState>()(
// Add backendMaxGraphNodes field for older versions
state.backendMaxGraphNodes = null
}
if (version < 15) {
// Fully migrate querySettings to the unified token control system
state.querySettings = {
mode: 'global',
response_type: 'Multiple Paragraphs',
top_k: 10,
chunk_top_k: 5,
chunk_rerank_top_k: 5,
max_entity_tokens: 10000,
max_relation_tokens: 10000,
max_total_tokens: 32000,
only_need_context: false,
only_need_prompt: false,
stream: true,
history_turns: 3,
hl_keywords: [],
ll_keywords: [],
user_prompt: ''
}
}
return state
}
}