chore: add citation system and enhance RAG UI components

Add citation tracking and display system across backend and frontend components.
Backend changes include citation.py for document attribution, enhanced query routes
with citation metadata, improved prompt templates, and PostgreSQL schema updates.
Frontend changes include a CitationMarker component, HoverCard UI, QuerySettings refinements,
and ChatMessage enhancements for displaying document sources. Update dependencies
and docker-compose test configuration for improved development workflow.
Author: clssck, 2025-12-01 17:50:00 +01:00
Parent: 77df910525
Commit: 663ada943a
23 changed files with 12102 additions and 260 deletions
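For context, a minimal sketch of how a client might exercise the new citation parameters; the base URL, the /query path, and the use of the requests library are assumptions, and field names follow the QueryRequest model in the query routes below.

import requests

# Hypothetical request against a local server (port 9621 as in the test compose file).
resp = requests.post(
    "http://localhost:9621/query",
    json={
        "query": "What does the citation system add?",
        "mode": "mix",
        "include_references": True,
        "include_chunk_content": True,
        "citation_mode": "footnotes",    # "none" | "inline" | "footnotes"
        "citation_threshold": 0.7,       # stricter matching as this approaches 1.0
    },
    timeout=120,
)
print(resp.json().get("response", ""))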


@ -54,13 +54,14 @@ services:
volumes:
- ./data/rag_storage_test:/app/data/rag_storage
- ./data/inputs_test:/app/data/inputs
- ./lightrag:/app/lightrag # Mount source for live reload
environment:
# Server
- HOST=0.0.0.0
- PORT=9621
- LOG_LEVEL=DEBUG
# LLM (OpenAI)
# LLM (OpenAI - gpt-4o-mini for reliable fast extraction)
- LLM_BINDING=openai
- LLM_MODEL=gpt-4o-mini
- LLM_BINDING_HOST=https://api.openai.com/v1
@ -85,8 +86,8 @@ services:
- POSTGRES_PASSWORD=lightrag_pass
- POSTGRES_DATABASE=lightrag
# Entity Resolution - ENABLED!
- ENTITY_RESOLUTION_ENABLED=true
# Entity Resolution - DISABLED for faster ingestion (testing Context Precision changes)
- ENTITY_RESOLUTION_ENABLED=false
- ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
- ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
- ENTITY_RESOLUTION_MAX_CANDIDATES=3
@ -94,20 +95,19 @@ services:
# Orphan Connection - MANUAL (use UI button instead of auto)
- AUTO_CONNECT_ORPHANS=false
# Processing - Aggressive settings from agent-sdk
# Processing - Matching agent-sdk working settings
- MAX_ASYNC=96
- MAX_PARALLEL_INSERT=10
- EMBEDDING_FUNC_MAX_ASYNC=16
- EMBEDDING_FUNC_MAX_ASYNC=2 # Match llamacpp parallel slots (prevent queue backlog)
- EMBEDDING_BATCH_NUM=48
# Gunicorn - 8 workers x 4 threads = 32 concurrent handlers
- GUNICORN_CMD_ARGS=--workers=8 --worker-class=gthread --threads=4 --worker-connections=1000 --timeout=120 --keep-alive=5 --graceful-timeout=30
# Extraction Optimization - Reduce Orphan Nodes
- CHUNK_SIZE=800 # Smaller chunks for focused extraction
- CHUNK_OVERLAP_SIZE=400 # 50% overlap captures cross-boundary relationships
- MAX_GLEANING=1 # Enable gleaning refinement pass
- FORCE_LLM_SUMMARY_ON_MERGE=4 # More aggressive entity consolidation
# Extraction - Using agent-sdk defaults for reliable ingestion
- CHUNK_SIZE=1200 # Default chunk size (agent-sdk default)
- CHUNK_OVERLAP_SIZE=100 # Default overlap
# MAX_GLEANING defaults to 1 (removed override of 2)
# Orphan Connection - Use UI button for manual triggering
# AUTO_CONNECT_ORPHANS is set to false above (manual mode)


@ -9,6 +9,7 @@ from enum import Enum
from fastapi.responses import StreamingResponse
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.constants import DEFAULT_TOP_K
from lightrag.utils import TiktokenTokenizer
from lightrag.api.utils_api import get_combined_auth_dependency
from fastapi import Depends
@ -218,7 +219,7 @@ def parse_query_mode(query: str) -> tuple[str, SearchMode, bool, Optional[str]]:
class OllamaAPI:
def __init__(self, rag: LightRAG, top_k: int = 60, api_key: Optional[str] = None):
def __init__(self, rag: LightRAG, top_k: int = DEFAULT_TOP_K, api_key: Optional[str] = None):
self.rag = rag
self.ollama_server_infos = rag.ollama_server_infos
self.top_k = top_k


@ -3,15 +3,76 @@ This module contains all query-related routes for the LightRAG API.
"""
import json
import re
from typing import Any, Dict, List, Literal, Optional
from fastapi import APIRouter, Depends, HTTPException
from lightrag.base import QueryParam
from lightrag.constants import DEFAULT_TOP_K
from lightrag.api.utils_api import get_combined_auth_dependency
from lightrag.utils import logger
from pydantic import BaseModel, Field, field_validator
router = APIRouter(tags=["query"])
# Pattern to match reasoning tags like <think>...</think>
REASONING_TAG_PATTERN = re.compile(r"<think>.*?</think>", re.DOTALL)
def strip_reasoning_tags(text: str) -> str:
"""Strip LLM reasoning tags like <think>...</think> from response text."""
if not text:
return text
return REASONING_TAG_PATTERN.sub("", text).strip()
async def filter_reasoning_stream(response_stream):
"""Filter <think>...</think> blocks from streaming response in real-time.
This is a state machine that buffers chunks and filters out reasoning blocks
as they stream in, preventing <think> tags from appearing to the user.
"""
buffer = ""
in_think_block = False
async for chunk in response_stream:
buffer += chunk
while buffer:
if in_think_block:
# Look for </think> to exit reasoning block
end_idx = buffer.find("</think>")
if end_idx != -1:
buffer = buffer[end_idx + 8:] # Skip past </think>
in_think_block = False
else:
break # Need more data to find closing tag
else:
# Look for <think> to enter reasoning block
start_idx = buffer.find("<think>")
if start_idx != -1:
# Emit everything before <think>
if start_idx > 0:
yield buffer[:start_idx]
buffer = buffer[start_idx + 7:] # Skip past <think>
in_think_block = True
else:
# Check for partial "<think>" match at buffer end
# This prevents emitting incomplete tags
for i in range(min(7, len(buffer)), 0, -1):
if "<think>"[:i] == buffer[-i:]:
if len(buffer) > i:
yield buffer[:-i]
buffer = buffer[-i:]
break
else:
yield buffer
buffer = ""
break
# Emit any remaining buffer (only if not inside a think block)
if buffer and not in_think_block:
yield buffer
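# Minimal illustration of the filter above (a sketch, not part of this module's API):
# feeding a <think> block split across chunks yields only the visible text.
#
#   async def _demo_chunks():
#       for piece in ["Hello <thi", "nk>hidden reasoning</think> wor", "ld"]:
#           yield piece
#
#   async def _demo():
#       out = []
#       async for visible in filter_reasoning_stream(_demo_chunks()):
#           out.append(visible)
#       assert "".join(out) == "Hello  world"  # double space where the tag block sat
#
#   asyncio.run(_demo())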
class QueryRequest(BaseModel):
query: str = Field(
@ -110,6 +171,18 @@ class QueryRequest(BaseModel):
description="If True, enables streaming output for real-time responses. Only affects /query/stream endpoint.",
)
citation_mode: Optional[Literal["none", "inline", "footnotes"]] = Field(
default="none",
description="Citation extraction mode: 'none' (no post-processing), 'inline' (add [n] markers in text), 'footnotes' (add markers and formatted footnotes). When enabled, citations are computed asynchronously after response completes.",
)
citation_threshold: Optional[float] = Field(
default=0.7,
ge=0.0,
le=1.0,
description="Minimum similarity threshold for citation matching (0.0-1.0). Higher values mean stricter matching.",
)
@field_validator("query", mode="after")
@classmethod
def query_strip_after(cls, query: str) -> str:
@ -134,7 +207,14 @@ class QueryRequest(BaseModel):
# Use Pydantic's `.model_dump(exclude_none=True)` to remove None values automatically
# Exclude API-level parameters that don't belong in QueryParam
request_data = self.model_dump(
exclude_none=True, exclude={"query", "include_chunk_content"}
exclude_none=True,
exclude={
"query",
"include_chunk_content",
"include_references",
"citation_mode",
"citation_threshold",
},
)
# Ensure `mode` and `stream` are set explicitly
@ -190,7 +270,118 @@ class StreamChunkResponse(BaseModel):
)
def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
class CitationSpanModel(BaseModel):
"""A span in the response with citation attribution."""
start_char: int = Field(description="Start character position in response")
end_char: int = Field(description="End character position in response")
text: str = Field(description="The text span being cited")
reference_ids: List[str] = Field(description="Reference IDs supporting this span")
confidence: float = Field(description="Citation confidence score (0.0-1.0)")
class EnhancedReferenceItem(BaseModel):
"""Enhanced reference with full metadata for footnotes."""
reference_id: str = Field(description="Unique reference identifier")
file_path: str = Field(description="Path to the source file")
document_title: Optional[str] = Field(
default=None, description="Human-readable document title"
)
section_title: Optional[str] = Field(
default=None, description="Section or chapter title"
)
page_range: Optional[str] = Field(default=None, description="Page range (e.g., pp. 45-67)")
excerpt: Optional[str] = Field(
default=None, description="Brief excerpt from the source"
)
async def _extract_and_stream_citations(
response: str,
chunks: List[Dict[str, Any]],
references: List[Dict[str, str]],
rag,
min_similarity: float,
citation_mode: str,
):
"""Extract citations from response and yield NDJSON lines.
NEW PROTOCOL (eliminates duplicate payload):
- Does NOT send full annotated_response (that would duplicate the streamed response)
- Instead sends citation positions + metadata for frontend marker insertion
- Frontend uses character positions to insert [n] markers client-side
Args:
response: The full LLM response text
chunks: List of chunk dictionaries from retrieval
references: List of reference dicts
rag: The RAG instance (for embedding function)
min_similarity: Minimum similarity threshold
citation_mode: 'inline' or 'footnotes'
Yields:
NDJSON lines for citation metadata (no duplicate text)
"""
try:
from lightrag.citation import extract_citations_from_response
# Extract citations using the citation module
citation_result = await extract_citations_from_response(
response=response,
chunks=chunks,
references=references,
embedding_func=rag.embedding_func,
min_similarity=min_similarity,
)
# Build citation markers with positions for frontend insertion
# Each marker tells frontend where to insert [n] without sending full text
citation_markers = []
for citation in citation_result.citations:
citation_markers.append({
"marker": "[" + ",".join(citation.reference_ids) + "]",
"insert_position": citation.end_char, # Insert after sentence
"reference_ids": citation.reference_ids,
"confidence": citation.confidence,
"text_preview": citation.text[:50] + "..." if len(citation.text) > 50 else citation.text,
})
# Build enhanced sources with metadata
sources = []
for ref in citation_result.references:
sources.append({
"reference_id": ref.reference_id,
"file_path": ref.file_path,
"document_title": ref.document_title,
"section_title": ref.section_title,
"page_range": ref.page_range,
"excerpt": ref.excerpt,
})
# Format footnotes if requested
footnotes = citation_result.footnotes if citation_mode == "footnotes" else []
# Send single consolidated citations_metadata object
# Frontend uses this to insert markers without needing the full text again
yield json.dumps({
"citations_metadata": {
"markers": citation_markers, # Position-based markers for insertion
"sources": sources, # Enhanced reference metadata
"footnotes": footnotes, # Pre-formatted footnote strings
"uncited_count": len(citation_result.uncited_claims),
}
}) + "\n"
except ImportError:
logger.warning("Citation module not available. Skipping citation extraction.")
yield json.dumps({"citation_error": "Citation module not available"}) + "\n"
except Exception as e:
logger.error(f"Citation extraction error: {str(e)}")
yield json.dumps({"citation_error": str(e)}) + "\n"
def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = DEFAULT_TOP_K):
combined_auth = get_combined_auth_dependency(api_key)
@router.post(
@ -421,6 +612,9 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
if not response_content:
response_content = "No relevant context found for the query."
# Strip reasoning tags like <think>...</think>
response_content = strip_reasoning_tags(response_content)
# Enrich references with chunk content if requested
if request.include_references and request.include_chunk_content:
chunks = data.get("chunks", [])
@ -672,12 +866,11 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
async def stream_generator():
# Extract references and LLM response from unified result
references = result.get("data", {}).get("references", [])
chunks = result.get("data", {}).get("chunks", [])
llm_response = result.get("llm_response", {})
# Enrich references with chunk content if requested
if request.include_references and request.include_chunk_content:
data = result.get("data", {})
chunks = data.get("chunks", [])
# Create a mapping from reference_id to chunk content
ref_id_to_content = {}
for chunk in chunks:
@ -698,6 +891,10 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
enriched_references.append(ref_copy)
references = enriched_references
# Track collected response for citation extraction
collected_response = []
citation_mode = request.citation_mode or "none"
if llm_response.get("is_streaming"):
# Streaming mode: send references first, then stream response chunks
if request.include_references:
@ -706,18 +903,36 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
response_stream = llm_response.get("response_iterator")
if response_stream:
try:
async for chunk in response_stream:
# Filter <think>...</think> blocks in real-time
async for chunk in filter_reasoning_stream(response_stream):
if chunk: # Only send non-empty content
yield f"{json.dumps({'response': chunk})}\n"
collected_response.append(chunk)
except Exception as e:
logger.error(f"Streaming error: {str(e)}")
yield f"{json.dumps({'error': str(e)})}\n"
# After streaming completes, extract citations if enabled
if citation_mode in ["inline", "footnotes"] and collected_response:
full_response = strip_reasoning_tags("".join(collected_response))
async for line in _extract_and_stream_citations(
full_response,
chunks,
references,
rag,
request.citation_threshold or 0.7,
citation_mode,
):
yield line
else:
# Non-streaming mode: send complete response in one message
response_content = llm_response.get("content", "")
if not response_content:
response_content = "No relevant context found for the query."
# Strip reasoning tags like <think>...</think>
response_content = strip_reasoning_tags(response_content)
# Create complete response object
complete_response = {"response": response_content}
if request.include_references:
@ -725,6 +940,18 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
yield f"{json.dumps(complete_response)}\n"
# Extract citations for non-streaming mode too
if citation_mode in ["inline", "footnotes"] and response_content:
async for line in _extract_and_stream_citations(
response_content,
chunks,
references,
rag,
request.citation_threshold or 0.7,
citation_mode,
):
yield line
return StreamingResponse(
stream_generator(),
media_type="application/x-ndjson",


@ -18,7 +18,7 @@ import numpy as np
logger = logging.getLogger(__name__)
# Configuration
CITATION_MIN_SIMILARITY = float(os.getenv("CITATION_MIN_SIMILARITY", "0.7"))
CITATION_MIN_SIMILARITY = float(os.getenv("CITATION_MIN_SIMILARITY", "0.5"))
CITATION_MAX_PER_SENTENCE = int(os.getenv("CITATION_MAX_PER_SENTENCE", "3"))


@ -47,8 +47,8 @@ DEFAULT_CHUNK_TOP_K = 20
DEFAULT_MAX_ENTITY_TOKENS = 6000
DEFAULT_MAX_RELATION_TOKENS = 8000
DEFAULT_MAX_TOTAL_TOKENS = 30000
DEFAULT_COSINE_THRESHOLD = 0.2
DEFAULT_RELATED_CHUNK_NUMBER = 5
DEFAULT_COSINE_THRESHOLD = 0.40 # Balanced: 0.35 too permissive, 0.45 breaks local mode
DEFAULT_RELATED_CHUNK_NUMBER = 8 # Increased from 5 for better context coverage
DEFAULT_KG_CHUNK_PICK_METHOD = "VECTOR"
# TODO: Deprecated. All conversation_history messages are sent to the LLM.


@ -115,7 +115,7 @@ def _is_nan(value: Any) -> bool:
class RAGEvaluator:
"""Evaluate RAG system quality using RAGAS metrics"""
def __init__(self, test_dataset_path: str = None, rag_api_url: str = None):
def __init__(self, test_dataset_path: str = None, rag_api_url: str = None, query_mode: str = "mix"):
"""
Initialize evaluator with test dataset
@ -123,6 +123,7 @@ class RAGEvaluator:
test_dataset_path: Path to test dataset JSON file
rag_api_url: Base URL of LightRAG API (e.g., http://localhost:9621)
If None, will try to read from environment or use default
query_mode: Query mode for retrieval (local, global, hybrid, mix, naive)
Environment Variables:
EVAL_LLM_MODEL: LLM model for evaluation (default: gpt-4o-mini)
@ -219,6 +220,7 @@ class RAGEvaluator:
self.test_dataset_path = Path(test_dataset_path)
self.rag_api_url = rag_api_url.rstrip("/")
self.query_mode = query_mode
self.results_dir = Path(__file__).parent / "results"
self.results_dir.mkdir(exist_ok=True)
@ -275,6 +277,7 @@ class RAGEvaluator:
logger.info(" • Total Test Cases: %s", len(self.test_cases))
logger.info(" • Test Dataset: %s", self.test_dataset_path.name)
logger.info(" • LightRAG API: %s", self.rag_api_url)
logger.info(" • Query Mode: %s", self.query_mode)
logger.info(" • Results Directory: %s", self.results_dir.name)
def _load_test_dataset(self) -> List[Dict[str, str]]:
@ -309,7 +312,7 @@ class RAGEvaluator:
try:
payload = {
"query": question,
"mode": "mix",
"mode": self.query_mode,
"include_references": True,
"include_chunk_content": True, # NEW: Request chunk content in references
"response_type": "Multiple Paragraphs",
@ -997,6 +1000,15 @@ Examples:
help="LightRAG API endpoint URL (default: http://localhost:9621 or $LIGHTRAG_API_URL environment variable)",
)
parser.add_argument(
"--mode",
"-m",
type=str,
default="mix",
choices=["local", "global", "hybrid", "mix", "naive"],
help="Query mode for retrieval (default: mix). 'local' for entity-specific questions, 'mix' for comprehensive retrieval.",
)
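# Example invocation with the new flag (script name and dataset path are
# illustrative; flag spellings inferred from the parser attributes used below):
#   python evaluate_rag.py --dataset tests/eval_dataset.json \
#       --ragendpoint http://localhost:9621 --mode local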
args = parser.parse_args()
logger.info("%s", "=" * 70)
@ -1004,7 +1016,7 @@ Examples:
logger.info("%s", "=" * 70)
evaluator = RAGEvaluator(
test_dataset_path=args.dataset, rag_api_url=args.ragendpoint
test_dataset_path=args.dataset, rag_api_url=args.ragendpoint, query_mode=args.mode
)
await evaluator.run()
except Exception as e:


@ -62,6 +62,7 @@ class PostgreSQLDB:
self.database = config["database"]
self.workspace = config["workspace"]
self.max = int(config["max_connections"])
self.min = int(config.get("min_connections", 5))
self.increment = 1
self.pool: Pool | None = None
@ -200,7 +201,7 @@ class PostgreSQLDB:
"database": self.database,
"host": self.host,
"port": self.port,
"min_size": 1,
"min_size": self.min, # Configurable via POSTGRES_MIN_CONNECTIONS
"max_size": self.max,
}
@ -1184,6 +1185,28 @@ class PostgreSQLDB:
("idx_lightrag_doc_status_workspace_path", "LIGHTRAG_DOC_STATUS", "(workspace, file_path)"),
]
# GIN indexes for array membership queries (chunk_ids lookups)
gin_indexes = [
("idx_lightrag_vdb_entity_chunk_ids_gin", "LIGHTRAG_VDB_ENTITY", "USING gin (chunk_ids)"),
("idx_lightrag_vdb_relation_chunk_ids_gin", "LIGHTRAG_VDB_RELATION", "USING gin (chunk_ids)"),
]
# Create GIN indexes separately (different syntax)
for index_name, table_name, index_type in gin_indexes:
if index_name not in existing_indexes:
try:
create_gin_sql = (
f"CREATE INDEX IF NOT EXISTS {index_name} ON {table_name} {index_type}"
)
logger.info(
f"PostgreSQL, Creating GIN index {index_name} on {table_name}"
)
await self.execute(create_gin_sql)
except Exception as e:
logger.warning(
f"PostgreSQL, Failed to create GIN index {index_name}: {e}"
)
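# Illustration: these GIN indexes speed up array-membership lookups such as
#   SELECT id FROM LIGHTRAG_VDB_ENTITY WHERE chunk_ids @> ARRAY['chunk-abc123'];
# (the chunk id is made up; @> tests array containment)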
for index_name, table_name, columns in performance_indexes:
if index_name not in existing_indexes:
try:
@ -1679,6 +1702,39 @@ class PostgreSQLDB:
logger.error(f"PostgreSQL database,\nsql:{sql},\ndata:{data},\nerror:{e}")
raise
async def executemany(
self,
sql: str,
data_list: list[tuple],
batch_size: int = 500,
) -> None:
"""Execute SQL with multiple parameter sets using asyncpg's executemany.
This is significantly faster than calling execute() in a loop because it
reduces database round-trips by batching multiple rows in a single operation.
Args:
sql: The SQL statement with positional parameters ($1, $2, etc.)
data_list: List of tuples, each containing parameters for one row
batch_size: Number of rows to process per batch (default 500)
"""
if not data_list:
return
async def _operation(connection: asyncpg.Connection) -> None:
for i in range(0, len(data_list), batch_size):
batch = data_list[i : i + batch_size]
await connection.executemany(sql, batch)
try:
await self._run_with_retry(_operation)
logger.debug(
f"PostgreSQL executemany: inserted {len(data_list)} rows in batches of {batch_size}"
)
except Exception as e:
logger.error(f"PostgreSQL executemany error: {e}, sql: {sql[:100]}...")
raise
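# Sketch of intended usage (table name and columns are illustrative only):
#   rows = [("ws1", "id-1", "content A"), ("ws1", "id-2", "content B")]
#   await db.executemany(
#       "INSERT INTO my_table (workspace, id, content) VALUES ($1, $2, $3) "
#       "ON CONFLICT (workspace, id) DO UPDATE SET content = EXCLUDED.content",
#       rows,
#   )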
class ClientManager:
_instances: dict[str, Any] = {"db": None, "ref_count": 0}
@ -1712,9 +1768,17 @@ class ClientManager:
"POSTGRES_WORKSPACE",
config.get("postgres", "workspace", fallback=None),
),
"max_connections": os.environ.get(
"POSTGRES_MAX_CONNECTIONS",
config.get("postgres", "max_connections", fallback=50),
"max_connections": int(
os.environ.get(
"POSTGRES_MAX_CONNECTIONS",
config.get("postgres", "max_connections", fallback=50),
)
),
"min_connections": int(
os.environ.get(
"POSTGRES_MIN_CONNECTIONS",
config.get("postgres", "min_connections", fallback=5),
)
),
# SSL configuration
"ssl_mode": os.environ.get(
@ -2161,108 +2225,117 @@ class PGKVStorage(BaseKVStorage):
if not data:
return
if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
# Get current UTC time and convert to naive datetime for database storage
current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None)
for k, v in data.items():
upsert_sql = SQL_TEMPLATES["upsert_text_chunk"]
_data = {
"workspace": self.workspace,
"id": k,
"tokens": v["tokens"],
"chunk_order_index": v["chunk_order_index"],
"full_doc_id": v["full_doc_id"],
"content": v["content"],
"file_path": v["file_path"],
"llm_cache_list": json.dumps(v.get("llm_cache_list", [])),
"create_time": current_time,
"update_time": current_time,
}
await self.db.execute(upsert_sql, _data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS):
for k, v in data.items():
upsert_sql = SQL_TEMPLATES["upsert_doc_full"]
_data = {
"id": k,
"content": v["content"],
"doc_name": v.get("file_path", ""), # Map file_path to doc_name
"workspace": self.workspace,
}
await self.db.execute(upsert_sql, _data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
for k, v in data.items():
upsert_sql = SQL_TEMPLATES["upsert_llm_response_cache"]
_data = {
"workspace": self.workspace,
"id": k, # Use flattened key as id
"original_prompt": v["original_prompt"],
"return_value": v["return"],
"chunk_id": v.get("chunk_id"),
"cache_type": v.get(
"cache_type", "extract"
), # Get cache_type from data
"queryparam": json.dumps(v.get("queryparam"))
if v.get("queryparam")
else None,
}
# Get current UTC time and convert to naive datetime for database storage
current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None)
if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
upsert_sql = SQL_TEMPLATES["upsert_text_chunk"]
# Collect all rows as tuples for batch insert
batch_data = [
(
self.workspace,
k,
v["tokens"],
v["chunk_order_index"],
v["full_doc_id"],
v["content"],
v["file_path"],
json.dumps(v.get("llm_cache_list", [])),
current_time,
current_time,
)
for k, v in data.items()
]
await self.db.executemany(upsert_sql, batch_data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS):
upsert_sql = SQL_TEMPLATES["upsert_doc_full"]
batch_data = [
(
k,
v["content"],
v.get("file_path", ""), # Map file_path to doc_name
self.workspace,
)
for k, v in data.items()
]
await self.db.executemany(upsert_sql, batch_data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE):
upsert_sql = SQL_TEMPLATES["upsert_llm_response_cache"]
batch_data = [
(
self.workspace,
k, # Use flattened key as id
v["original_prompt"],
v["return"],
v.get("chunk_id"),
v.get("cache_type", "extract"),
json.dumps(v.get("queryparam")) if v.get("queryparam") else None,
)
for k, v in data.items()
]
await self.db.executemany(upsert_sql, batch_data)
await self.db.execute(upsert_sql, _data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_ENTITIES):
# Get current UTC time and convert to naive datetime for database storage
current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None)
for k, v in data.items():
upsert_sql = SQL_TEMPLATES["upsert_full_entities"]
_data = {
"workspace": self.workspace,
"id": k,
"entity_names": json.dumps(v["entity_names"]),
"count": v["count"],
"create_time": current_time,
"update_time": current_time,
}
await self.db.execute(upsert_sql, _data)
upsert_sql = SQL_TEMPLATES["upsert_full_entities"]
batch_data = [
(
self.workspace,
k,
json.dumps(v["entity_names"]),
v["count"],
current_time,
current_time,
)
for k, v in data.items()
]
await self.db.executemany(upsert_sql, batch_data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_RELATIONS):
# Get current UTC time and convert to naive datetime for database storage
current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None)
for k, v in data.items():
upsert_sql = SQL_TEMPLATES["upsert_full_relations"]
_data = {
"workspace": self.workspace,
"id": k,
"relation_pairs": json.dumps(v["relation_pairs"]),
"count": v["count"],
"create_time": current_time,
"update_time": current_time,
}
await self.db.execute(upsert_sql, _data)
upsert_sql = SQL_TEMPLATES["upsert_full_relations"]
batch_data = [
(
self.workspace,
k,
json.dumps(v["relation_pairs"]),
v["count"],
current_time,
current_time,
)
for k, v in data.items()
]
await self.db.executemany(upsert_sql, batch_data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_ENTITY_CHUNKS):
# Get current UTC time and convert to naive datetime for database storage
current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None)
for k, v in data.items():
upsert_sql = SQL_TEMPLATES["upsert_entity_chunks"]
_data = {
"workspace": self.workspace,
"id": k,
"chunk_ids": json.dumps(v["chunk_ids"]),
"count": v["count"],
"create_time": current_time,
"update_time": current_time,
}
await self.db.execute(upsert_sql, _data)
upsert_sql = SQL_TEMPLATES["upsert_entity_chunks"]
batch_data = [
(
self.workspace,
k,
json.dumps(v["chunk_ids"]),
v["count"],
current_time,
current_time,
)
for k, v in data.items()
]
await self.db.executemany(upsert_sql, batch_data)
elif is_namespace(self.namespace, NameSpace.KV_STORE_RELATION_CHUNKS):
# Get current UTC time and convert to naive datetime for database storage
current_time = datetime.datetime.now(timezone.utc).replace(tzinfo=None)
for k, v in data.items():
upsert_sql = SQL_TEMPLATES["upsert_relation_chunks"]
_data = {
"workspace": self.workspace,
"id": k,
"chunk_ids": json.dumps(v["chunk_ids"]),
"count": v["count"],
"create_time": current_time,
"update_time": current_time,
}
await self.db.execute(upsert_sql, _data)
upsert_sql = SQL_TEMPLATES["upsert_relation_chunks"]
batch_data = [
(
self.workspace,
k,
json.dumps(v["chunk_ids"]),
v["count"],
current_time,
current_time,
)
for k, v in data.items()
]
await self.db.executemany(upsert_sql, batch_data)
async def index_done_callback(self) -> None:
# PG handles persistence automatically
@ -2376,77 +2449,73 @@ class PGVectorStorage(BaseVectorStorage):
await ClientManager.release_client(self.db)
self.db = None
def _upsert_chunks(
def _prepare_chunk_tuple(
self, item: dict[str, Any], current_time: datetime.datetime
) -> tuple[str, dict[str, Any]]:
) -> tuple:
"""Prepare a tuple for batch chunk upsert."""
try:
upsert_sql = SQL_TEMPLATES["upsert_chunk"]
data: dict[str, Any] = {
"workspace": self.workspace,
"id": item["__id__"],
"tokens": item["tokens"],
"chunk_order_index": item["chunk_order_index"],
"full_doc_id": item["full_doc_id"],
"content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()),
"file_path": item["file_path"],
"create_time": current_time,
"update_time": current_time,
}
return (
self.workspace,
item["__id__"],
item["tokens"],
item["chunk_order_index"],
item["full_doc_id"],
item["content"],
json.dumps(item["__vector__"].tolist()),
item["file_path"],
current_time,
current_time,
)
except Exception as e:
logger.error(
f"[{self.workspace}] Error to prepare upsert,\nsql: {e}\nitem: {item}"
f"[{self.workspace}] Error to prepare upsert,\nerror: {e}\nitem: {item}"
)
raise
return upsert_sql, data
def _upsert_entities(
def _prepare_entity_tuple(
self, item: dict[str, Any], current_time: datetime.datetime
) -> tuple[str, dict[str, Any]]:
upsert_sql = SQL_TEMPLATES["upsert_entity"]
) -> tuple:
"""Prepare a tuple for batch entity upsert."""
source_id = item["source_id"]
if isinstance(source_id, str) and "<SEP>" in source_id:
chunk_ids = source_id.split("<SEP>")
else:
chunk_ids = [source_id]
data: dict[str, Any] = {
"workspace": self.workspace,
"id": item["__id__"],
"entity_name": item["entity_name"],
"content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_ids": chunk_ids,
"file_path": item.get("file_path", None),
"create_time": current_time,
"update_time": current_time,
}
return upsert_sql, data
return (
self.workspace,
item["__id__"],
item["entity_name"],
item["content"],
json.dumps(item["__vector__"].tolist()),
chunk_ids,
item.get("file_path", None),
current_time,
current_time,
)
def _upsert_relationships(
def _prepare_relationship_tuple(
self, item: dict[str, Any], current_time: datetime.datetime
) -> tuple[str, dict[str, Any]]:
upsert_sql = SQL_TEMPLATES["upsert_relationship"]
) -> tuple:
"""Prepare a tuple for batch relationship upsert."""
source_id = item["source_id"]
if isinstance(source_id, str) and "<SEP>" in source_id:
chunk_ids = source_id.split("<SEP>")
else:
chunk_ids = [source_id]
data: dict[str, Any] = {
"workspace": self.workspace,
"id": item["__id__"],
"source_id": item["src_id"],
"target_id": item["tgt_id"],
"content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_ids": chunk_ids,
"file_path": item.get("file_path", None),
"create_time": current_time,
"update_time": current_time,
}
return upsert_sql, data
return (
self.workspace,
item["__id__"],
item["src_id"],
item["tgt_id"],
item["content"],
json.dumps(item["__vector__"].tolist()),
chunk_ids,
item.get("file_path", None),
current_time,
current_time,
)
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}")
@ -2462,29 +2531,42 @@ class PGVectorStorage(BaseVectorStorage):
}
for k, v in data.items()
]
# Batch compute embeddings (already optimized)
contents = [v["content"] for v in data.values()]
batches = [
contents[i : i + self._max_batch_size]
for i in range(0, len(contents), self._max_batch_size)
]
embedding_tasks = [self.embedding_func(batch) for batch in batches]
embeddings_list = await asyncio.gather(*embedding_tasks)
embeddings = np.concatenate(embeddings_list)
# Assign embeddings to items
for i, d in enumerate(list_data):
d["__vector__"] = embeddings[i]
for item in list_data:
if is_namespace(self.namespace, NameSpace.VECTOR_STORE_CHUNKS):
upsert_sql, data = self._upsert_chunks(item, current_time)
elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_ENTITIES):
upsert_sql, data = self._upsert_entities(item, current_time)
elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_RELATIONSHIPS):
upsert_sql, data = self._upsert_relationships(item, current_time)
else:
raise ValueError(f"{self.namespace} is not supported")
await self.db.execute(upsert_sql, data)
# Prepare batch data based on namespace and execute in single batch
if is_namespace(self.namespace, NameSpace.VECTOR_STORE_CHUNKS):
upsert_sql = SQL_TEMPLATES["upsert_chunk"]
batch_data = [
self._prepare_chunk_tuple(item, current_time) for item in list_data
]
elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_ENTITIES):
upsert_sql = SQL_TEMPLATES["upsert_entity"]
batch_data = [
self._prepare_entity_tuple(item, current_time) for item in list_data
]
elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_RELATIONSHIPS):
upsert_sql = SQL_TEMPLATES["upsert_relationship"]
batch_data = [
self._prepare_relationship_tuple(item, current_time)
for item in list_data
]
else:
raise ValueError(f"{self.namespace} is not supported")
await self.db.executemany(upsert_sql, batch_data)
#################### query method ###############
async def query(


@ -218,9 +218,11 @@ class LightRAG:
)
)
entity_resolution_config: EntityResolutionConfig | None = field(default=None)
entity_resolution_config: EntityResolutionConfig = field(
default_factory=EntityResolutionConfig
)
"""Configuration for entity resolution (deduplication).
Set to EntityResolutionConfig() to enable, or None to disable.
Now enabled by default. Set to None to disable.
Resolves entities like 'FDA' → 'US Food and Drug Administration'."""
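# Example: opt out of the new default explicitly (sketch; other constructor
# arguments omitted):
#   rag = LightRAG(working_dir="./rag_storage", entity_resolution_config=None)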
# Orphan connection


@ -365,7 +365,16 @@ async def openai_complete_if_cache(
delta = chunk.choices[0].delta
content = getattr(delta, "content", None)
reasoning_content = getattr(delta, "reasoning_content", "")
# Support both OpenAI's reasoning_content and OpenRouter's reasoning field
reasoning_content = getattr(delta, "reasoning_content", "") or getattr(delta, "reasoning", "")
# Also handle OpenRouter's reasoning_details array format
if not reasoning_content:
reasoning_details = getattr(delta, "reasoning_details", None)
if reasoning_details and isinstance(reasoning_details, list):
for detail in reasoning_details:
if isinstance(detail, dict) and detail.get("text"):
reasoning_content = detail.get("text", "")
break
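# Illustrative delta shapes handled above (layouts assumed, values made up):
#   delta.reasoning_content = "step-by-step thoughts"              # OpenAI-style field
#   delta.reasoning = "step-by-step thoughts"                      # OpenRouter simple field
#   delta.reasoning_details = [{"text": "step-by-step thoughts"}]  # OpenRouter detail list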
# Handle COT logic for streaming (only if enabled)
if enable_cot:
@ -527,7 +536,18 @@ async def openai_complete_if_cache(
else:
# Handle regular content responses
content = getattr(message, "content", None)
reasoning_content = getattr(message, "reasoning_content", "")
# Support both OpenAI's reasoning_content and OpenRouter's reasoning field
reasoning_content = getattr(message, "reasoning_content", "") or getattr(message, "reasoning", "")
# Also handle OpenRouter's reasoning_details array format
if not reasoning_content:
reasoning_details = getattr(message, "reasoning_details", None)
if reasoning_details and isinstance(reasoning_details, list):
# Concatenate all reasoning text for non-streaming
reasoning_parts = []
for detail in reasoning_details:
if isinstance(detail, dict) and detail.get("text"):
reasoning_parts.append(detail.get("text", ""))
reasoning_content = "".join(reasoning_parts)
# Handle COT logic for non-streaming responses (only if enabled)
final_content = ""


@ -5,6 +5,7 @@ from pathlib import Path
import asyncio
import json
import json_repair
import os
import re
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
@ -78,18 +79,48 @@ import time
import hashlib
from dotenv import load_dotenv
# Query embedding cache for avoiding redundant API calls
# Query embedding cache configuration (configurable via environment variables)
QUERY_EMBEDDING_CACHE_TTL = int(os.getenv("QUERY_EMBEDDING_CACHE_TTL", "3600")) # 1 hour
QUERY_EMBEDDING_CACHE_MAX_SIZE = int(os.getenv("QUERY_EMBEDDING_CACHE_SIZE", "10000"))
# Redis cache configuration
REDIS_EMBEDDING_CACHE_ENABLED = os.getenv("REDIS_EMBEDDING_CACHE", "false").lower() == "true"
REDIS_URI = os.getenv("REDIS_URI", "redis://localhost:6379")
# Local in-memory cache with LRU eviction
# Structure: {query_hash: (embedding, timestamp)}
_query_embedding_cache: dict[str, tuple[list[float], float]] = {}
QUERY_EMBEDDING_CACHE_TTL = 3600 # 1 hour TTL
QUERY_EMBEDDING_CACHE_MAX_SIZE = 1000 # Maximum cache entries
# Global Redis client (lazy initialized)
_redis_client = None
async def get_cached_query_embedding(
query: str, embedding_func
) -> list[float] | None:
async def _get_redis_client():
"""Lazy initialize Redis client."""
global _redis_client
if _redis_client is None and REDIS_EMBEDDING_CACHE_ENABLED:
try:
import redis.asyncio as redis
_redis_client = redis.from_url(REDIS_URI, decode_responses=True)
# Test connection
await _redis_client.ping()
logger.info(f"Redis embedding cache connected: {REDIS_URI}")
except ImportError:
logger.warning("Redis package not installed. Install with: pip install redis")
return None
except Exception as e:
logger.warning(f"Failed to connect to Redis: {e}. Falling back to local cache.")
return None
return _redis_client
async def get_cached_query_embedding(query: str, embedding_func) -> list[float] | None:
"""Get query embedding with caching to avoid redundant API calls.
Supports both local in-memory cache and Redis for cross-worker sharing.
Redis is used when REDIS_EMBEDDING_CACHE=true environment variable is set.
Args:
query: The query string to embed
embedding_func: The embedding function to call on cache miss
@ -99,11 +130,27 @@ async def get_cached_query_embedding(
"""
query_hash = hashlib.sha256(query.encode()).hexdigest()[:16]
current_time = time.time()
redis_key = f"lightrag:emb:{query_hash}"
# Check cache
# Try Redis cache first (if enabled)
if REDIS_EMBEDDING_CACHE_ENABLED:
try:
redis_client = await _get_redis_client()
if redis_client:
cached_json = await redis_client.get(redis_key)
if cached_json:
embedding = json.loads(cached_json)
logger.debug(f"Redis embedding cache hit for hash {query_hash[:8]}")
# Also update local cache
_query_embedding_cache[query_hash] = (embedding, current_time)
return embedding
except Exception as e:
logger.debug(f"Redis cache read error: {e}")
# Check local cache
cached = _query_embedding_cache.get(query_hash)
if cached and (current_time - cached[1]) < QUERY_EMBEDDING_CACHE_TTL:
logger.debug(f"Query embedding cache hit for hash {query_hash[:8]}")
logger.debug(f"Local embedding cache hit for hash {query_hash[:8]}")
return cached[0]
# Cache miss - compute embedding
@ -111,7 +158,7 @@ async def get_cached_query_embedding(
embedding = await embedding_func([query])
embedding_result = embedding[0] # Extract first from batch
# Manage cache size - simple eviction of oldest entries
# Manage local cache size - LRU eviction of oldest entries
if len(_query_embedding_cache) >= QUERY_EMBEDDING_CACHE_MAX_SIZE:
# Remove oldest 10% of entries
sorted_entries = sorted(
@ -120,13 +167,30 @@ async def get_cached_query_embedding(
for old_key, _ in sorted_entries[: QUERY_EMBEDDING_CACHE_MAX_SIZE // 10]:
del _query_embedding_cache[old_key]
# Store in local cache
_query_embedding_cache[query_hash] = (embedding_result, current_time)
logger.debug(f"Query embedding cached for hash {query_hash[:8]}")
# Store in Redis (if enabled)
if REDIS_EMBEDDING_CACHE_ENABLED:
try:
redis_client = await _get_redis_client()
if redis_client:
await redis_client.setex(
redis_key,
QUERY_EMBEDDING_CACHE_TTL,
json.dumps(embedding_result),
)
logger.debug(f"Embedding cached in Redis for hash {query_hash[:8]}")
except Exception as e:
logger.debug(f"Redis cache write error: {e}")
logger.debug(f"Query embedding computed and cached for hash {query_hash[:8]}")
return embedding_result
except Exception as e:
logger.warning(f"Failed to compute query embedding: {e}")
return None
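# Enabling the shared cache is an environment-level switch (values illustrative):
#   REDIS_EMBEDDING_CACHE=true
#   REDIS_URI=redis://localhost:6379
#   QUERY_EMBEDDING_CACHE_TTL=3600
# With these set, get_cached_query_embedding() checks Redis first, then the local
# in-memory cache, and only then calls embedding_func([query]).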
# use the .env that is inside the current folder
# allows to use different .env file for each lightrag instance
# the OS environment variables take precedence over the .env file
@ -1843,9 +1907,14 @@ async def _merge_nodes_then_upsert(
llm_response_cache: BaseKVStorage | None = None,
entity_chunks_storage: BaseKVStorage | None = None,
pre_resolution_map: dict[str, str] | None = None,
prefetched_nodes: dict[str, dict] | None = None,
) -> tuple[dict, str | None]:
"""Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.
Args:
prefetched_nodes: Optional dict mapping entity names to their existing node data.
If provided, avoids individual get_node() calls for better performance.
Returns:
Tuple of (node_data, original_entity_name). original_entity_name is set if
entity resolution changed the name (e.g., "Dupixant" → "Dupixent"),
@ -1969,8 +2038,12 @@ async def _merge_nodes_then_upsert(
already_description = []
already_file_paths = []
# 1. Get existing node data from knowledge graph
already_node = await knowledge_graph_inst.get_node(entity_name)
# 1. Get existing node data from knowledge graph (use prefetched if available)
if prefetched_nodes is not None and entity_name in prefetched_nodes:
already_node = prefetched_nodes[entity_name]
else:
# Fallback to individual fetch if not prefetched (e.g., after VDB resolution)
already_node = await knowledge_graph_inst.get_node(entity_name)
if already_node:
already_entity_types.append(already_node["entity_type"])
already_source_ids.extend(already_node["source_id"].split(GRAPH_FIELD_SEP))
@ -2922,6 +2995,28 @@ async def merge_nodes_and_edges(
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
# ===== Batch Prefetch: Load existing entity data in single query =====
# Build list of entity names to prefetch (apply pre-resolution where applicable)
prefetch_entity_names = []
for entity_name in all_nodes.keys():
resolved_name = pre_resolution_map.get(entity_name, entity_name)
prefetch_entity_names.append(resolved_name)
# Batch fetch existing nodes to avoid N+1 query pattern during parallel processing
prefetched_nodes: dict[str, dict] = {}
if prefetch_entity_names:
try:
prefetched_nodes = await knowledge_graph_inst.get_nodes_batch(
prefetch_entity_names
)
logger.debug(
f"Prefetched {len(prefetched_nodes)}/{len(prefetch_entity_names)} "
f"existing entities for merge"
)
except Exception as e:
logger.warning(f"Batch entity prefetch failed: {e}. Falling back to individual fetches.")
prefetched_nodes = {}
# Resolution map to track original→resolved entity names (e.g., "Dupixant"→"Dupixent")
# This will be used to remap edge endpoints in Phase 2
entity_resolution_map: dict[str, str] = {}
@ -2955,6 +3050,7 @@ async def merge_nodes_and_edges(
llm_response_cache,
entity_chunks_storage,
pre_resolution_map,
prefetched_nodes,
)
# Track resolution mapping for edge remapping in Phase 2
@ -3941,7 +4037,7 @@ async def _perform_kg_search(
query_embedding = await get_cached_query_embedding(
query, actual_embedding_func
)
if query_embedding:
if query_embedding is not None:
logger.debug("Pre-computed query embedding for all vector operations")
# Handle local and global modes


@ -41,6 +41,7 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel
* Look for implicit categorical or thematic connections to other entities.
* Consider whether the entity belongs to a broader group or domain represented by other entities.
* Extract comparative relationships if the entity is mentioned alongside others.
* **Attribution Verification:** When extracting relationships, ensure the source and target entities are correctly identified from the text. Do not conflate similar entities or transfer attributes from one entity to another.
3. **Delimiter Usage Protocol:**
* The `{tuple_delimiter}` is a complete, atomic marker and **must not be filled with content**. It serves strictly as a field separator.
@ -227,7 +228,43 @@ PROMPTS["fail_response"] = (
"Sorry, I'm not able to provide an answer to that question.[no-context]"
)
PROMPTS["rag_response"] = """---Role---
# Default RAG response prompt - cite-ready (no LLM-generated citations)
# Citations are added by post-processing. This gives cleaner, more accurate results.
PROMPTS["rag_response"] = """You're helping someone understand a topic. Write naturally, like explaining to a curious friend.
STYLE RULES:
- Flowing paragraphs, NOT bullets or numbered lists
- Connect sentences with transitions (however, this means, for example)
- Combine related facts into sentences rather than listing separately
- Vary sentence length - mix short and long
GOOD EXAMPLE:
"Machine learning is a branch of AI that enables computers to learn from data without explicit programming. The field includes several approaches: supervised learning uses labeled data, while unsupervised learning finds hidden patterns. Deep learning, using multi-layer neural networks, has proven especially effective for image recognition and language processing."
BAD EXAMPLE:
"- Machine learning: branch of AI
- Learns from data
- Types: supervised, unsupervised
- Deep learning uses neural networks"
Answer using ONLY the context below. Do NOT include [1], [2] citations - they're added automatically.
{user_prompt}
Context:
{context_data}
"""
# Strict mode suffix - append when response_type="strict"
PROMPTS["rag_response_strict_suffix"] = """
STRICT GROUNDING:
- NEVER state specific numbers/dates unless they appear EXACTLY in context
- If information isn't in context, say "not specified in available information"
- Entity summaries for overview, Source Excerpts for precision
"""
# Legacy prompt with LLM-generated citations (for backward compatibility)
PROMPTS["rag_response_with_llm_citations"] = """---Role---
You are an expert AI assistant specializing in synthesizing information from a provided knowledge base. Your primary function is to answer user queries accurately by ONLY using the information within the provided **Context**.
@ -250,6 +287,8 @@ Consider the conversation history if provided to maintain conversational flow an
2. Content & Grounding:
- Strictly adhere to the provided context from the **Context**; DO NOT invent, assume, or infer any information not explicitly stated.
- If the answer cannot be found in the **Context**, state that you do not have enough information to answer. Do not attempt to guess.
- CRITICAL FOR FACTS: When stating specific facts (dates, numbers, names, statistics), you MUST verify each fact appears EXACTLY in the provided context. If a specific date or number is not explicitly stated in the context, say "the exact [year/number/date] is not specified in the available information" rather than guessing.
- When the question asks "which" or "who" or "how many", provide ONLY the direct answer with facts from context. Do not elaborate with information not explicitly in the context.
3. Formatting & Language:
- The response MUST be in the same language as the user query.
@ -281,8 +320,45 @@ Consider the conversation history if provided to maintain conversational flow an
{context_data}
"""
# Default naive RAG response prompt - cite-ready (no LLM-generated citations)
PROMPTS["naive_rag_response"] = """---Role---
You are an expert AI assistant synthesizing information from a knowledge base.
---Goal---
Generate a comprehensive, well-structured answer to the user query using ONLY information from the provided Document Chunks.
---Instructions---
1. **Cite-Ready Writing Style**:
- Write each factual claim as a distinct, complete sentence
- DO NOT include citation markers like [1], [2], or footnote references
- DO NOT add a References section - citations will be added automatically by the system
- Each sentence should be traceable to specific information in the context
2. **Content & Grounding**:
- Use ONLY information from the provided context
- DO NOT invent, assume, or infer any information not explicitly stated
- If the answer cannot be found in the context, state that clearly
- CRITICAL: Verify each fact appears EXACTLY in the provided context before stating it
3. **Formatting**:
- The response MUST be in the same language as the user query
- Use Markdown formatting for clarity (headings, bullet points, bold)
- The response should be presented in {response_type}
4. Additional Instructions: {user_prompt}
---Context---
{content_data}
"""
# Legacy naive RAG prompt with LLM-generated citations (for backward compatibility)
PROMPTS["naive_rag_response_with_llm_citations"] = """---Role---
You are an expert AI assistant specializing in synthesizing information from a provided knowledge base. Your primary function is to answer user queries accurately by ONLY using the information within the provided **Context**.
---Goal---
@ -304,6 +380,8 @@ Consider the conversation history if provided to maintain conversational flow an
2. Content & Grounding:
- Strictly adhere to the provided context from the **Context**; DO NOT invent, assume, or infer any information not explicitly stated.
- If the answer cannot be found in the **Context**, state that you do not have enough information to answer. Do not attempt to guess.
- CRITICAL FOR FACTS: When stating specific facts (dates, numbers, names, statistics), you MUST verify each fact appears EXACTLY in the provided context. If a specific date or number is not explicitly stated in the context, say "the exact [year/number/date] is not specified in the available information" rather than guessing.
- When the question asks "which" or "who" or "how many", provide ONLY the direct answer with facts from context. Do not elaborate with information not explicitly in the context.
3. Formatting & Language:
- The response MUST be in the same language as the user query.
@ -335,30 +413,31 @@ Consider the conversation history if provided to maintain conversational flow an
{content_data}
"""
# Backward compatibility aliases - the default prompts are now cite-ready
PROMPTS["cite_ready_rag_response"] = PROMPTS["rag_response"]
PROMPTS["cite_ready_naive_rag_response"] = PROMPTS["naive_rag_response"]
PROMPTS["kg_query_context"] = """
Knowledge Graph Data (Entity):
## Entity Summaries (use for definitions and general facts)
```json
{entities_str}
```
Knowledge Graph Data (Relationship):
## Relationships (use to explain connections between concepts)
```json
{relations_str}
```
Document Chunks (Each entry has a reference_id refer to the `Reference Document List`):
## Source Excerpts (use for specific facts, numbers, quotes)
```json
{text_chunks_str}
```
Reference Document List (Each entry starts with a [reference_id] that corresponds to entries in the Document Chunks):
```
## References
{reference_list_str}
```
"""


@ -8,6 +8,7 @@
"@radix-ui/react-alert-dialog": "^1.1.15",
"@radix-ui/react-checkbox": "^1.3.3",
"@radix-ui/react-dialog": "^1.1.15",
"@radix-ui/react-hover-card": "^1.1.15",
"@radix-ui/react-popover": "^1.1.15",
"@radix-ui/react-progress": "^1.1.7",
"@radix-ui/react-scroll-area": "^1.2.10",
@ -318,6 +319,8 @@
"@radix-ui/react-focus-scope": ["@radix-ui/react-focus-scope@1.1.7", "", { "dependencies": { "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-use-callback-ref": "1.1.1" }, "peerDependencies": { "@types/react": "*", "@types/react-dom": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-t2ODlkXBQyn7jkl6TNaw/MtVEVvIGelJDCG41Okq/KwUsJBwQ4XVZsHAVUkK4mBv3ewiAS3PGuUWuY2BoK4ZUw=="],
"@radix-ui/react-hover-card": ["@radix-ui/react-hover-card@1.1.15", "", { "dependencies": { "@radix-ui/primitive": "1.1.3", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-context": "1.1.2", "@radix-ui/react-dismissable-layer": "1.1.11", "@radix-ui/react-popper": "1.2.8", "@radix-ui/react-portal": "1.1.9", "@radix-ui/react-presence": "1.1.5", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-use-controllable-state": "1.2.2" }, "peerDependencies": { "@types/react": "*", "@types/react-dom": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-qgTkjNT1CfKMoP0rcasmlH2r1DAiYicWsDsufxl940sT2wHNEWWv6FMWIQXWhVdmC1d/HYfbhQx60KYyAtKxjg=="],
"@radix-ui/react-id": ["@radix-ui/react-id@1.1.1", "", { "dependencies": { "@radix-ui/react-use-layout-effect": "1.1.1" }, "peerDependencies": { "@types/react": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react"] }, "sha512-kGkGegYIdQsOb4XjsfM97rXsiHaBwco+hFI66oO4s9LU+PLAC5oJ7khdOVFxkhsmlbpUqDAvXw11CluXP+jkHg=="],
"@radix-ui/react-popover": ["@radix-ui/react-popover@1.1.15", "", { "dependencies": { "@radix-ui/primitive": "1.1.3", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-context": "1.1.2", "@radix-ui/react-dismissable-layer": "1.1.11", "@radix-ui/react-focus-guards": "1.1.3", "@radix-ui/react-focus-scope": "1.1.7", "@radix-ui/react-id": "1.1.1", "@radix-ui/react-popper": "1.2.8", "@radix-ui/react-portal": "1.1.9", "@radix-ui/react-presence": "1.1.5", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-slot": "1.2.3", "@radix-ui/react-use-controllable-state": "1.2.2", "aria-hidden": "^1.2.4", "react-remove-scroll": "^2.6.3" }, "peerDependencies": { "@types/react": "*", "@types/react-dom": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-kr0X2+6Yy/vJzLYJUPCZEc8SfQcf+1COFoAqauJm74umQhta9M7lNJHP7QQS3vkvcGLQUbWpMzwrXYwrYztHKA=="],

lightrag_webui/package-lock.json (generated, 10832 lines): diff suppressed because it is too large.


@ -20,6 +20,7 @@
"@radix-ui/react-alert-dialog": "^1.1.15",
"@radix-ui/react-checkbox": "^1.3.3",
"@radix-ui/react-dialog": "^1.1.15",
"@radix-ui/react-hover-card": "^1.1.15",
"@radix-ui/react-popover": "^1.1.15",
"@radix-ui/react-progress": "^1.1.7",
"@radix-ui/react-scroll-area": "^1.2.10",
@ -82,14 +83,14 @@
"zustand": "^5.0.8"
},
"devDependencies": {
"@biomejs/biome": "^1.9.3",
"@eslint/js": "^9.37.0",
"@stylistic/eslint-plugin-js": "^3.1.0",
"@tailwindcss/typography": "^0.5.15",
"@tailwindcss/vite": "^4.1.14",
"@types/bun": "^1.2.23",
"@types/katex": "^0.16.7",
"@types/node": "^22.18.9",
"@biomejs/biome": "^1.9.3",
"@tailwindcss/typography": "^0.5.15",
"@types/react": "^19.2.2",
"@types/react-dom": "^19.2.1",
"@types/react-i18next": "^8.1.0",


@ -101,12 +101,47 @@ export type LightragDocumentsScanProgress = {
*/
export type QueryMode = 'naive' | 'local' | 'global' | 'hybrid' | 'mix' | 'bypass'
/**
* Citation marker with position data for frontend insertion
*/
export type CitationMarker = {
marker: string // e.g., "[1]" or "[1,2]"
insert_position: number // Character position to insert marker
reference_ids: string[] // Reference IDs this marker cites
confidence: number // Match confidence (0.0-1.0)
text_preview: string // Preview of the cited text
}
/**
* Enhanced source metadata for hover cards
*/
export type CitationSource = {
reference_id: string
file_path: string
document_title: string | null
section_title: string | null
page_range: string | null
excerpt: string | null
}
/**
* Consolidated citation metadata from backend
*/
export type CitationsMetadata = {
markers: CitationMarker[] // Position-based markers for insertion
sources: CitationSource[] // Enhanced reference metadata
footnotes: string[] // Pre-formatted footnote strings
uncited_count: number // Number of claims without citations
}
export type Message = {
role: 'user' | 'assistant' | 'system'
content: string
thinkingContent?: string
displayContent?: string
thinkingTime?: number | null
citationsProcessed?: boolean
citationsMetadata?: CitationsMetadata // New consolidated citation data
}
export type QueryRequest = {
@ -142,6 +177,10 @@ export type QueryRequest = {
user_prompt?: string
/** Enable reranking for retrieved text chunks. If True but no rerank model is configured, a warning will be issued. Default is True. */
enable_rerank?: boolean
/** Citation mode for post-processing citations. 'none' = no citations, 'inline' = [n] markers only, 'footnotes' = full footnotes with document titles */
citation_mode?: 'none' | 'inline' | 'footnotes'
/** Minimum similarity threshold (0.0-1.0) for matching response sentences to source chunks. Higher = stricter matching. Default is 0.7 */
citation_threshold?: number
}
export type QueryResponse = {
@ -409,7 +448,8 @@ export const queryText = async (request: QueryRequest): Promise<QueryResponse> =
export const queryTextStream = async (
request: QueryRequest,
onChunk: (chunk: string) => void,
onError?: (error: string) => void
onError?: (error: string) => void,
onCitations?: (metadata: CitationsMetadata) => void
) => {
const apiKey = useSettingsStore.getState().apiKey
const token = localStorage.getItem('LIGHTRAG-API-TOKEN')
@ -486,7 +526,11 @@ export const queryTextStream = async (
onChunk(parsed.response)
} else if (parsed.error && onError) {
onError(parsed.error)
} else if (parsed.citations_metadata && onCitations) {
// NEW: Handle consolidated citations_metadata object
onCitations(parsed.citations_metadata as CitationsMetadata)
}
// Silently ignore references and other events
} catch (error) {
console.error('Error parsing stream chunk:', line, error)
if (onError) onError(`Error parsing server response: ${line}`)
@ -503,6 +547,8 @@ export const queryTextStream = async (
onChunk(parsed.response)
} else if (parsed.error && onError) {
onError(parsed.error)
} else if (parsed.citations_metadata && onCitations) {
onCitations(parsed.citations_metadata as CitationsMetadata)
}
} catch (error) {
console.error('Error parsing final chunk:', buffer, error)


@ -1,7 +1,7 @@
import type { Message } from '@/api/lightrag'
import type { CitationsMetadata, Message } from '@/api/lightrag'
import useTheme from '@/hooks/useTheme'
import { cn } from '@/lib/utils'
import { type ReactNode, memo, useEffect, useMemo, useRef, useState } from 'react' // Import useMemo
import { type ReactNode, memo, useCallback, useEffect, useMemo, useRef, useState } from 'react'
import { remarkFootnotes } from '@/utils/remarkFootnotes'
import mermaid from 'mermaid'
@ -14,8 +14,9 @@ import remarkMath from 'remark-math'
import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'
import { oneDark, oneLight } from 'react-syntax-highlighter/dist/cjs/styles/prism'
import { ChevronDownIcon, LoaderIcon } from 'lucide-react'
import { BrainIcon, ChevronDownIcon, LoaderIcon } from 'lucide-react'
import { useTranslation } from 'react-i18next'
import { CitationMarker } from './CitationMarker'
// KaTeX configuration options interface
interface KaTeXOptions {
@ -43,6 +44,71 @@ export type MessageWithError = Message & {
latexRendered?: boolean
}
/**
* Helper component to render text with citation markers as interactive HoverCards.
* Parses [n] and [n,m] patterns and replaces them with CitationMarker components.
*/
function TextWithCitations({
children,
citationsMetadata,
}: {
children: ReactNode
citationsMetadata?: CitationsMetadata
}) {
// If no citation metadata or children is not a string, render as-is
if (!citationsMetadata || typeof children !== 'string') {
return <>{children}</>
}
const text = children
// Match citation patterns like [1], [2], [1,2], etc.
const citationPattern = /\[(\d+(?:,\d+)*)\]/g
const parts: ReactNode[] = []
let lastIndex = 0
let match: RegExpExecArray | null
let keyIndex = 0
while ((match = citationPattern.exec(text)) !== null) {
// Add text before the citation
if (match.index > lastIndex) {
parts.push(text.slice(lastIndex, match.index))
}
// Parse reference IDs from the marker
const markerText = match[0]
const refIds = match[1].split(',').map((id) => id.trim())
// Find matching marker data for confidence
const markerData = citationsMetadata.markers?.find((m) => m.marker === markerText)
const confidence = markerData?.confidence ?? 0.5
// Add the citation marker component
parts.push(
<CitationMarker
key={`citation-${keyIndex++}`}
marker={markerText}
referenceIds={refIds}
confidence={confidence}
sources={citationsMetadata.sources || []}
/>
)
lastIndex = match.index + match[0].length
}
// Add remaining text
if (lastIndex < text.length) {
parts.push(text.slice(lastIndex))
}
// Fallback: nothing was collected (empty text), return children unchanged
if (parts.length === 0) {
return <>{children}</>
}
return <>{parts}</>
}
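The citation regex on its own behaves as follows (a small runnable sketch using the same pattern):

// Same pattern as above, shown in isolation
const pattern = /\[(\d+(?:,\d+)*)\]/g
const sample = 'Chunk overlap helps cross-boundary extraction [1], and reranking tightens context [2,3].'
for (const m of sample.matchAll(pattern)) {
  console.log(m[0], '->', m[1].split(','))  // "[1]" -> ["1"], then "[2,3]" -> ["2","3"]
}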
// Restore original component definition and export
export const ChatMessage = ({
message,
@ -94,6 +160,9 @@ export const ChatMessage = ({
loadKaTeX()
}, [])
// Get citationsMetadata from message for use in markdown components
const citationsMetadata = message.citationsMetadata
const mainMarkdownComponents = useMemo(
() => ({
code: (props: any) => {
@ -132,6 +201,11 @@ export const ChatMessage = ({
</CodeHighlight>
)
},
// Custom text renderer that handles citation markers [n]
// Transforms plain text [1], [2], [1,2] into interactive CitationMarker components
text: ({ children }: { children?: ReactNode }) => (
<TextWithCitations citationsMetadata={citationsMetadata}>{children}</TextWithCitations>
),
p: ({ children }: { children?: ReactNode }) => <div className="my-2">{children}</div>,
h1: ({ children }: { children?: ReactNode }) => (
<h1 className="text-xl font-bold mt-4 mb-2">{children}</h1>
@ -153,7 +227,7 @@ export const ChatMessage = ({
),
li: ({ children }: { children?: ReactNode }) => <li className="my-1">{children}</li>,
}),
[message.mermaidRendered, message.role]
[message.mermaidRendered, message.role, citationsMetadata]
)
const thinkingMarkdownComponents = useMemo(
@ -179,48 +253,67 @@ export const ChatMessage = ({
: 'w-[95%] bg-muted'
} rounded-lg px-4 py-2`}
>
{/* Thinking process display - only for assistant messages */}
{/* Always render to prevent layout shift when switching tabs */}
{/* Thinking Pill - collapsible bubble UI */}
{message.role === 'assistant' && (isThinking || thinkingTime !== null) && (
<div
className={cn(
'mb-2',
// Reduce visual priority in inactive tabs while maintaining layout
!isTabActive && 'opacity-50'
)}
>
<div
className="flex items-center text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-200 transition-colors duration-200 text-sm cursor-pointer select-none"
<div className={cn('mb-3', !isTabActive && 'opacity-50')}>
{/* Pill Header - always visible */}
<button
type="button"
onClick={() => {
// Allow expansion when there's thinking content, even during thinking process
if (finalThinkingContent && finalThinkingContent.trim() !== '') {
setIsThinkingExpanded(!isThinkingExpanded)
}
}}
className={cn(
'inline-flex items-center gap-2 px-3 py-1.5 rounded-full text-xs font-medium transition-all',
'border shadow-sm select-none',
isThinking
? 'bg-amber-50 border-amber-200 text-amber-700 dark:bg-amber-950/50 dark:border-amber-800 dark:text-amber-300'
: 'bg-slate-100 border-slate-200 text-slate-600 dark:bg-slate-800 dark:border-slate-700 dark:text-slate-300',
finalThinkingContent?.trim() && 'cursor-pointer hover:shadow-md'
)}
>
{isThinking ? (
<>
{/* Only show spinner animation in active tab to save resources */}
{isTabActive && <LoaderIcon className="mr-2 size-4 animate-spin" />}
{isTabActive && (
<div className="w-2 h-2 bg-amber-500 rounded-full animate-pulse" />
)}
<span>{t('retrievePanel.chatMessage.thinking')}</span>
</>
) : (
typeof thinkingTime === 'number' && (
<span>{t('retrievePanel.chatMessage.thinkingTime', { time: thinkingTime })}</span>
<>
<BrainIcon className="w-3.5 h-3.5" />
<span>{t('retrievePanel.chatMessage.thinkingTime', { time: thinkingTime })}</span>
</>
)
)}
{/* Show chevron when there's thinking content, even during thinking process */}
{finalThinkingContent && finalThinkingContent.trim() !== '' && (
<ChevronDownIcon
className={`ml-2 size-4 shrink-0 transition-transform ${isThinkingExpanded ? 'rotate-180' : ''}`}
className={cn(
'w-3.5 h-3.5 transition-transform',
isThinkingExpanded && 'rotate-180'
)}
/>
)}
</div>
{/* Show thinking content when expanded and content exists, even during thinking process */}
</button>
{/* Expandable Content */}
{isThinkingExpanded && finalThinkingContent && finalThinkingContent.trim() !== '' && (
<div className="mt-2 pl-4 border-l-2 border-primary/20 dark:border-primary/40 text-sm prose dark:prose-invert max-w-none break-words prose-p:my-1 prose-headings:my-2 [&_sup]:text-[0.75em] [&_sup]:align-[0.1em] [&_sup]:leading-[0] [&_sub]:text-[0.75em] [&_sub]:align-[-0.2em] [&_sub]:leading-[0] [&_mark]:bg-yellow-200 [&_mark]:dark:bg-yellow-800 [&_u]:underline [&_del]:line-through [&_ins]:underline [&_ins]:decoration-green-500 [&_.footnotes]:mt-6 [&_.footnotes]:pt-3 [&_.footnotes]:border-t [&_.footnotes]:border-border [&_.footnotes_ol]:text-xs [&_.footnotes_li]:my-0.5 [&_a[href^='#fn']]:text-primary [&_a[href^='#fn']]:no-underline [&_a[href^='#fn']]:hover:underline [&_a[href^='#fnref']]:text-primary [&_a[href^='#fnref']]:no-underline [&_a[href^='#fnref']]:hover:underline text-foreground">
<div
className={cn(
'mt-2 ml-2 p-3 rounded-lg text-sm',
'bg-slate-50 border border-slate-200 dark:bg-slate-900 dark:border-slate-700',
'max-h-[400px] overflow-y-auto',
'prose dark:prose-invert max-w-none break-words prose-p:my-1 prose-headings:my-2',
'[&_sup]:text-[0.75em] [&_sup]:align-[0.1em] [&_sup]:leading-[0]',
'[&_sub]:text-[0.75em] [&_sub]:align-[-0.2em] [&_sub]:leading-[0]',
'[&_mark]:bg-yellow-200 [&_mark]:dark:bg-yellow-800',
'text-foreground'
)}
>
{isThinking && (
<div className="mb-2 text-xs text-gray-400 dark:text-gray-300 italic">
<div className="mb-2 text-xs text-amber-600 dark:text-amber-400 italic">
{t('retrievePanel.chatMessage.thinkingInProgress', 'Thinking in progress...')}
</div>
)}
@ -238,16 +331,9 @@ export const ChatMessage = ({
displayMode: false,
strict: false,
trust: true,
// Add silent error handling to avoid console noise
errorCallback: (error: string, latex: string) => {
// Only show detailed errors in development environment
if (process.env.NODE_ENV === 'development') {
console.warn(
'KaTeX rendering error in thinking content:',
error,
'for LaTeX:',
latex
)
console.warn('KaTeX error in thinking:', error, latex)
}
},
},

View file

@ -0,0 +1,161 @@
/**
* CitationMarker Component
*
* Renders citation markers (e.g., [1]) as interactive hover cards
* showing source metadata like document title, section, page, and excerpt.
*/
import type { CitationSource } from '@/api/lightrag'
import Badge from '@/components/ui/Badge'
import { HoverCard, HoverCardContent, HoverCardTrigger } from '@/components/ui/HoverCard'
import { FileTextIcon } from 'lucide-react'
import type * as React from 'react'
interface CitationMarkerProps {
/** The citation marker text, e.g., "[1]" or "[1,2]" */
marker: string
/** Reference IDs this marker cites */
referenceIds: string[]
/** Confidence score (0-1) */
confidence: number
/** Source metadata for hover card */
sources: CitationSource[]
}
/**
* Interactive citation marker with hover card showing source metadata
*/
export function CitationMarker({
marker,
referenceIds,
confidence,
sources,
}: CitationMarkerProps) {
// Find sources matching our reference IDs
const matchingSources = sources.filter((s) => referenceIds.includes(s.reference_id))
// Confidence badge color based on score
const confidenceColor =
confidence >= 0.8
? 'bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200'
: confidence >= 0.6
? 'bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-200'
: 'bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200'
return (
<HoverCard openDelay={200} closeDelay={100}>
<HoverCardTrigger asChild>
<button
type="button"
className="inline-flex items-center text-primary hover:text-primary/80 hover:underline cursor-pointer font-medium text-sm mx-0.5 focus:outline-none focus:ring-2 focus:ring-primary/20 rounded"
>
{marker}
</button>
</HoverCardTrigger>
<HoverCardContent className="w-80" side="top" align="center">
<div className="space-y-3">
{matchingSources.map((source) => (
<div key={source.reference_id} className="space-y-2">
{/* Document title */}
<div className="flex items-start gap-2">
<FileTextIcon className="w-4 h-4 mt-0.5 text-muted-foreground shrink-0" />
<h4 className="font-semibold text-sm leading-tight">
{source.document_title || 'Untitled Document'}
</h4>
</div>
{/* Section title */}
{source.section_title && (
<p className="text-xs text-muted-foreground pl-6">
Section: {source.section_title}
</p>
)}
{/* Page range */}
{source.page_range && (
<p className="text-xs text-muted-foreground pl-6">
Pages: {source.page_range}
</p>
)}
{/* Excerpt */}
{source.excerpt && (
<blockquote className="pl-6 border-l-2 border-muted text-xs italic text-muted-foreground line-clamp-3">
"{source.excerpt}"
</blockquote>
)}
{/* File path */}
<p className="text-xs text-muted-foreground/70 pl-6 truncate" title={source.file_path}>
{source.file_path}
</p>
</div>
))}
{/* Confidence badge */}
<div className="flex items-center justify-between pt-2 border-t">
<span className="text-xs text-muted-foreground">Match confidence</span>
<Badge variant="outline" className={confidenceColor}>
{(confidence * 100).toFixed(0)}%
</Badge>
</div>
</div>
</HoverCardContent>
</HoverCard>
)
}
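A standalone usage sketch of the component (props mirror CitationMarkerProps; the source object is invented):

// Illustrative standalone usage (data is made up)
const example = (
  <CitationMarker
    marker="[1]"
    referenceIds={['1']}
    confidence={0.86}
    sources={[{
      reference_id: '1',
      file_path: 'data/inputs/spec.pdf',
      document_title: 'Process Specification',
      section_title: 'Cleaning validation',
      page_range: '12-14',
      excerpt: 'Rinse samples are collected after the final wash cycle.',
    }]}
  />
)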
/**
* Parses text containing citation markers and returns React elements
* with interactive CitationMarker components.
*
* @param text - Text that may contain [n] or [n,m] patterns
* @param sources - Array of citation sources for hover card metadata
* @param markers - Array of citation markers with position and confidence data
* @returns Array of React elements (strings and CitationMarker components)
*/
export function renderTextWithCitations(
text: string,
sources: CitationSource[],
markers: Array<{ marker: string; reference_ids: string[]; confidence: number }>
): React.ReactNode[] {
// Match citation patterns like [1], [2], [1,2], etc.
const citationPattern = /\[(\d+(?:,\d+)*)\]/g
const parts: React.ReactNode[] = []
let lastIndex = 0
let match: RegExpExecArray | null
while ((match = citationPattern.exec(text)) !== null) {
// Add text before the citation
if (match.index > lastIndex) {
parts.push(text.slice(lastIndex, match.index))
}
// Parse reference IDs from the marker
const markerText = match[0]
const refIds = match[1].split(',').map((id) => id.trim())
// Find matching marker data for confidence
const markerData = markers.find((m) => m.marker === markerText)
const confidence = markerData?.confidence ?? 0.5
// Add the citation marker component
parts.push(
<CitationMarker
key={`citation-${match.index}`}
marker={markerText}
referenceIds={refIds}
confidence={confidence}
sources={sources}
/>
)
lastIndex = match.index + match[0].length
}
// Add remaining text
if (lastIndex < text.length) {
parts.push(text.slice(lastIndex))
}
return parts
}
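Calling the helper directly looks roughly like this (citationsMetadata is assumed to be a CitationsMetadata value already in scope):

// Sketch: turning a plain string into text plus interactive markers
const nodes = renderTextWithCitations(
  'Reranking improves context precision [1].',
  citationsMetadata.sources,
  citationsMetadata.markers
)
// e.g. return <p>{nodes}</p> inside a component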

View file

@ -52,6 +52,8 @@ export default function QuerySettings() {
max_entity_tokens: 6000,
max_relation_tokens: 8000,
max_total_tokens: 30000,
citation_mode: 'none' as 'none' | 'inline' | 'footnotes',
citation_threshold: 0.7,
}),
[]
)
@ -474,6 +476,87 @@ export default function QuerySettings() {
/>
</div>
</>
{/* Citation Settings */}
<>
<div className="pt-2 mt-2 border-t border-gray-200 dark:border-gray-700">
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="citation_mode_select" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.citationMode')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.citationModeTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
</div>
<div className="flex items-center gap-1">
<Select
value={querySettings.citation_mode || 'none'}
onValueChange={(v) => handleChange('citation_mode', v as 'none' | 'inline' | 'footnotes')}
>
<SelectTrigger
id="citation_mode_select"
className="hover:bg-primary/5 h-9 cursor-pointer focus:ring-0 focus:ring-offset-0 focus:outline-0 active:right-0 flex-1 text-left [&>span]:break-all [&>span]:line-clamp-1"
>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectGroup>
<SelectItem value="none">
{t('retrievePanel.querySettings.citationModeOptions.none')}
</SelectItem>
<SelectItem value="inline">
{t('retrievePanel.querySettings.citationModeOptions.inline')}
</SelectItem>
<SelectItem value="footnotes">
{t('retrievePanel.querySettings.citationModeOptions.footnotes')}
</SelectItem>
</SelectGroup>
</SelectContent>
</Select>
<ResetButton onClick={() => handleReset('citation_mode')} title="Reset to default (None)" />
</div>
{/* Citation Threshold - only show when citation mode is not 'none' */}
{querySettings.citation_mode && querySettings.citation_mode !== 'none' && (
<>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="citation_threshold" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.citationThreshold')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.citationThresholdTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<div className="flex items-center gap-1">
<Input
id="citation_threshold"
type="number"
step="0.05"
min="0"
max="1"
value={querySettings.citation_threshold ?? 0.7}
onChange={(e) => {
const value = parseFloat(e.target.value)
if (!isNaN(value) && value >= 0 && value <= 1) {
handleChange('citation_threshold', value)
}
}}
className="h-9 flex-1 pr-2 [&::-webkit-outer-spin-button]:appearance-none [&::-webkit-inner-spin-button]:appearance-none [-moz-appearance:textfield]"
/>
<ResetButton onClick={() => handleReset('citation_threshold')} title="Reset to default (0.7)" />
</div>
</>
)}
</>
</div>
</div>
</CardContent>
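For intuition, the threshold gates sentence-to-source matching on the backend (citation.py, not shown in this diff); a hedged sketch of the idea, with invented names:

// Hedged sketch only: how a citation_threshold could gate matches server-side
type ScoredSource = { reference_id: string; similarity: number }

function pickCitation(candidates: ScoredSource[], threshold = 0.7): ScoredSource | null {
  const best = candidates.reduce(
    (a, b) => (b.similarity > a.similarity ? b : a),
    { reference_id: '', similarity: -1 }
  )
  // Sentences whose best match falls below the threshold stay uncited
  // and are tallied in citations_metadata.uncited_count.
  return best.similarity >= threshold ? best : null
}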

View file

@ -0,0 +1,33 @@
import { cn } from '@/lib/utils'
import * as HoverCardPrimitive from '@radix-ui/react-hover-card'
import * as React from 'react'
const HoverCard = HoverCardPrimitive.Root
const HoverCardTrigger = HoverCardPrimitive.Trigger
const HoverCardContent = React.forwardRef<
React.ComponentRef<typeof HoverCardPrimitive.Content>,
React.ComponentPropsWithoutRef<typeof HoverCardPrimitive.Content>
>(({ className, align = 'center', sideOffset = 4, ...props }, ref) => (
<HoverCardPrimitive.Content
ref={ref}
align={align}
sideOffset={sideOffset}
className={cn(
'z-50 w-64 rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-none',
'data-[state=open]:animate-in data-[state=closed]:animate-out',
'data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0',
'data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95',
'data-[side=bottom]:slide-in-from-top-2',
'data-[side=left]:slide-in-from-right-2',
'data-[side=right]:slide-in-from-left-2',
'data-[side=top]:slide-in-from-bottom-2',
className
)}
{...props}
/>
))
HoverCardContent.displayName = HoverCardPrimitive.Content.displayName
export { HoverCard, HoverCardTrigger, HoverCardContent }
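Minimal usage of the wrapper (same API surface as Radix HoverCard):

// Sketch: basic HoverCard composition
const card = (
  <HoverCard openDelay={200}>
    <HoverCardTrigger asChild>
      <button type="button">[1]</button>
    </HoverCardTrigger>
    <HoverCardContent side="top">Source details go here</HoverCardContent>
  </HoverCard>
)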

View file

@ -1,5 +1,5 @@
import { queryText, queryTextStream } from '@/api/lightrag'
import type { QueryMode } from '@/api/lightrag'
import type { CitationsMetadata, QueryMode } from '@/api/lightrag'
import { ChatMessage, type MessageWithError } from '@/components/retrieval/ChatMessage'
import QuerySettings from '@/components/retrieval/QuerySettings'
import Button from '@/components/ui/Button'
@ -231,6 +231,7 @@ export default function RetrievalTesting() {
thinkingContent: undefined, // Explicitly initialize to undefined
displayContent: undefined, // Explicitly initialize to undefined
isThinking: false, // Explicitly initialize to false
citationsProcessed: false, // Prevent finally block from overwriting citation content
}
const prevMessages = [...messages]
@ -373,9 +374,66 @@ export default function RetrievalTesting() {
// Run query
if (state.querySettings.stream) {
let errorMessage = ''
await queryTextStream(queryParams, updateAssistantMessage, (error) => {
errorMessage += error
})
await queryTextStream(
queryParams,
updateAssistantMessage,
(error) => {
errorMessage += error
},
// Citation callback - use position markers to insert citations client-side
// NEW: No longer receives annotated_response (which duplicated payload)
// Instead receives position metadata for client-side marker insertion
(() => {
let citationsApplied = false
return (metadata: CitationsMetadata) => {
// Guard against multiple invocations
if (citationsApplied || !metadata.markers || metadata.markers.length === 0) return
citationsApplied = true
// Insert markers into the accumulated response using position data
// Sort by position descending so we can insert from end to start (preserves positions)
const sortedMarkers = [...metadata.markers].sort(
(a, b) => b.insert_position - a.insert_position
)
let annotatedContent = assistantMessage.content
for (const marker of sortedMarkers) {
// Insert marker at the specified position
if (marker.insert_position <= annotatedContent.length) {
annotatedContent =
annotatedContent.slice(0, marker.insert_position) +
marker.marker +
annotatedContent.slice(marker.insert_position)
}
}
// Append footnotes if provided
let finalContent = annotatedContent
if (metadata.footnotes && metadata.footnotes.length > 0) {
finalContent += '\n\n---\n\n**References:**\n' + metadata.footnotes.join('\n')
}
// Update message with annotated content and store citation metadata for HoverCards
setMessages((prev) =>
prev.map((msg) =>
msg.id === assistantMessage.id
? {
...msg,
content: finalContent,
displayContent: finalContent,
citationsProcessed: true,
citationsMetadata: metadata, // Store for HoverCard rendering
}
: msg
)
)
// Also update the local reference for final cleanup operations
assistantMessage.content = finalContent
assistantMessage.displayContent = finalContent
assistantMessage.citationsProcessed = true
}
})()
)
if (errorMessage) {
if (assistantMessage.content) {
errorMessage = assistantMessage.content + '\n' + errorMessage
@ -413,7 +471,8 @@ export default function RetrievalTesting() {
}
// Ensure display content is correctly set based on final parsing
if (finalCotResult.displayContent !== undefined) {
// BUT skip if citations were processed (they already set displayContent)
if (!assistantMessage.citationsProcessed && finalCotResult.displayContent !== undefined) {
assistantMessage.displayContent = finalCotResult.displayContent
}
} catch (error) {

View file

@ -5,18 +5,19 @@ export const webuiPrefix = '/webui/'
export const controlButtonVariant: ButtonVariantType = 'ghost'
export const labelColorDarkTheme = '#FFFFFF'
export const LabelColorHighlightedDarkTheme = '#000000'
// Dark theme graph palette tuned for contrast on charcoal backgrounds
export const labelColorDarkTheme = '#E5ECFF'
export const LabelColorHighlightedDarkTheme = '#0F172A'
export const labelColorLightTheme = '#000'
export const nodeColorDisabled = '#E2E2E2'
export const nodeBorderColor = '#EEEEEE'
export const nodeBorderColorSelected = '#F57F17'
export const nodeColorDisabled = '#9CA3AF'
export const nodeBorderColor = '#CBD5E1'
export const nodeBorderColorSelected = '#F97316'
export const nodeBorderColorHiddenConnections = '#F59E0B' // Amber color for nodes with hidden connections
export const edgeColorDarkTheme = '#888888'
export const edgeColorSelected = '#F57F17'
export const edgeColorHighlightedDarkTheme = '#F57F17'
export const edgeColorDarkTheme = '#4B5563'
export const edgeColorSelected = '#F97316'
export const edgeColorHighlightedDarkTheme = '#F59E0B'
export const edgeColorHighlightedLightTheme = '#F57F17'
export const searchResultLimit = 50

View file

@ -494,7 +494,16 @@
"userPromptTooltip": "Provide additional response requirements to the LLM (unrelated to query content, only for output processing).",
"userPromptPlaceholder": "Enter custom prompt (optional)",
"enableRerank": "Enable Rerank",
"enableRerankTooltip": "Enable reranking for retrieved text chunks. If True but no rerank model is configured, a warning will be issued. Default is True."
"enableRerankTooltip": "Enable reranking for retrieved text chunks. If True but no rerank model is configured, a warning will be issued. Default is True.",
"citationMode": "Citation Mode",
"citationModeTooltip": "Add source citations to responses:\n• None: No citations (fastest)\n• Inline: [n] markers in text only\n• Footnotes: Full footnotes with document titles",
"citationModeOptions": {
"none": "None",
"inline": "Inline [n]",
"footnotes": "Footnotes"
},
"citationThreshold": "Citation Threshold",
"citationThresholdTooltip": "Minimum similarity score (0-1) for matching sentences to sources. Higher = stricter matching, fewer citations. Default: 0.7"
}
},
"apiSite": {

View file

@ -161,6 +161,8 @@ const useSettingsStoreBase = create<SettingsState>()(
history_turns: 0,
user_prompt: '',
enable_rerank: true,
citation_mode: 'none',
citation_threshold: 0.7,
},
setTheme: (theme: Theme) => set({ theme }),
@ -303,7 +305,7 @@ const useSettingsStoreBase = create<SettingsState>()(
{
name: 'settings-storage',
storage: createJSONStorage(() => localStorage),
version: 23,
version: 24,
migrate: (state: any, version: number) => {
if (version < 2) {
state.showEdgeLabel = false
@ -428,6 +430,13 @@ const useSettingsStoreBase = create<SettingsState>()(
// Add expand depth setting for Load Connections
state.graphExpandDepth = 1
}
if (version < 24) {
// Add citation settings for post-processing citations
if (state.querySettings) {
state.querySettings.citation_mode = 'none'
state.querySettings.citation_threshold = 0.7
}
}
return state
},
}
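The version-24 step follows the same pattern as the earlier migrations; as a before/after sketch (the persisted shape is illustrative, not the full settings object):

// Sketch: effect of the v23 -> v24 migration on a persisted settings object
const beforeV24 = { querySettings: { enable_rerank: true } }  // no citation fields yet
const afterV24 = {
  querySettings: { ...beforeV24.querySettings, citation_mode: 'none', citation_threshold: 0.7 },
}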