feat: add automatic entity resolution with 3-layer matching

Implement automatic entity resolution to prevent duplicate nodes in the knowledge graph. The system uses a 3-layer approach: 1. Case-insensitive exact matching (free, instant) 2. Fuzzy string matching >85% threshold (free, instant) 3. Vector similarity + LLM verification (for acronyms/synonyms) Key features: - Pre-resolution phase prevents race conditions in parallel processing - Numeric suffix detection blocks false matches (IL-4 ≠ IL-13) - PostgreSQL alias cache for fast lookups on subsequent ingestion - Configurable thresholds via environment variables Bug fixes included: - Fix fuzzy matching false positives for numbered entities - Fix alias cache not being populated (missing db parameter) - Skip entity_aliases table from generic id index creation New files: - lightrag/entity_resolution/ - Core resolution module - tests/test_entity_resolution/ - Unit tests - docker/postgres-age-vector/ - Custom PG image with pgvector + AGE - docker-compose.test.yml - Integration test environment Configuration (env.example): - ENTITY_RESOLUTION_ENABLED=true - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85 - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5 - ENTITY_RESOLUTION_MAX_CANDIDATES=3
2025-11-27 12:43:45 +01:00 · 2025-11-27 12:43:45 +01:00 · 48c7732edc
commit 48c7732edc
parent 4f12fe121d
20 changed files with 1561 additions and 101 deletions
--- a/docker-compose.test.yml
+++ b/docker-compose.test.yml
@ -0,0 +1,84 @@
 name: lightrag-entity-resolution-test
 services:
  postgres:
    container_name: lightrag-postgres
    build:
      context: ./docker/postgres-age-vector
      dockerfile: Dockerfile
    environment:
      POSTGRES_DB: lightrag
      POSTGRES_USER: lightrag
      POSTGRES_PASSWORD: lightrag_pass
    ports:
      - "5433:5432"  # Use 5433 to avoid conflict with agent-sdk postgres
    volumes:
      - pgdata_test:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
      interval: 5s
      timeout: 5s
      retries: 5
  lightrag:
    container_name: lightrag-test
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "9622:9621"  # Use 9622 to avoid conflict
    volumes:
      - ./data/rag_storage_test:/app/data/rag_storage
      - ./data/inputs_test:/app/data/inputs
    environment:
      # Server
      - HOST=0.0.0.0
      - PORT=9621
      - LOG_LEVEL=DEBUG
      # LLM (OpenAI)
      - LLM_BINDING=openai
      - LLM_MODEL=gpt-4o-mini
      - LLM_BINDING_HOST=https://api.openai.com/v1
      - LLM_BINDING_API_KEY=${OPENAI_API_KEY}
      # Embedding
      - EMBEDDING_BINDING=openai
      - EMBEDDING_MODEL=text-embedding-3-small
      - EMBEDDING_DIM=1536
      - EMBEDDING_BINDING_HOST=https://api.openai.com/v1
      - EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}
      # Storage Configuration - Full PostgreSQL!
      # Custom postgres image has pgvector + Apache AGE
      - LIGHTRAG_KV_STORAGE=PGKVStorage
      - LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
      - LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
      - LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
      - POSTGRES_HOST=postgres
      - POSTGRES_PORT=5432
      - POSTGRES_USER=lightrag
      - POSTGRES_PASSWORD=lightrag_pass
      - POSTGRES_DATABASE=lightrag
      # Entity Resolution - ENABLED!
      - ENTITY_RESOLUTION_ENABLED=true
      - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
      - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
      - ENTITY_RESOLUTION_MAX_CANDIDATES=3
      # Processing
      - MAX_ASYNC=4
      - CHUNK_SIZE=1200
    depends_on:
      postgres:
        condition: service_healthy
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 30s
 volumes:
  pgdata_test:
--- a/docker/postgres-age-vector/Dockerfile
+++ b/docker/postgres-age-vector/Dockerfile
@ -0,0 +1,26 @@
 # Start from pgvector image (has vector extension pre-built correctly)
 FROM pgvector/pgvector:pg17
 # Install build dependencies for AGE
 RUN apt-get update && apt-get install -y \
    build-essential \
    git \
    postgresql-server-dev-17 \
    libreadline-dev \
    zlib1g-dev \
    flex \
    bison \
    && rm -rf /var/lib/apt/lists/*
 # Install Apache AGE 1.6.0 for PG17
 RUN cd /tmp \
    && git clone --branch release/PG17/1.6.0 https://github.com/apache/age.git \
    && cd age \
    && make \
    && make install \
    && rm -rf /tmp/age
 # Add initialization script to create extensions
 RUN echo "CREATE EXTENSION IF NOT EXISTS vector;" > /docker-entrypoint-initdb.d/01-vector.sql \
    && echo "CREATE EXTENSION IF NOT EXISTS age;" > /docker-entrypoint-initdb.d/02-age.sql \
    && echo "SET search_path = ag_catalog, public;" >> /docker-entrypoint-initdb.d/02-age.sql
--- a/env.example
+++ b/env.example
@ -127,6 +127,16 @@ SUMMARY_LANGUAGE=English
 ### Entity types that the LLM will attempt to recognize
 # ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
 ###########################################################
 ### Entity Resolution Configuration
 ### Automatically deduplicates entities (e.g., "FDA" = "US Food and Drug Administration")
 ### Uses 3-layer approach: case normalization → fuzzy matching → LLM verification
 ###########################################################
 # ENTITY_RESOLUTION_ENABLED=true
 # ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
 # ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
 # ENTITY_RESOLUTION_MAX_CANDIDATES=3
 ### Chunk size for document splitting, 500~1500 is recommended
 # CHUNK_SIZE=1200
 # CHUNK_OVERLAP_SIZE=100
--- a/examples/test_entity_resolution.py
+++ b/examples/test_entity_resolution.py
@ -0,0 +1,93 @@
 """
 Quick test for Entity Resolution feature.
 Tests that:
 1. "FDA" and "US Food and Drug Administration" resolve to the same entity
 2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching
 """
 import asyncio
 import os
 import shutil
 from lightrag import LightRAG
 from lightrag.entity_resolution import EntityResolutionConfig
 from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
 from lightrag.utils import logger
 import logging
 WORKING_DIR = "./test_entity_resolution"
 # Test document with entities that should be deduplicated
 TEST_DOC = """
 The FDA approved Dupixent for treating eczema in 2017.
 The US Food and Drug Administration later expanded the drug's indications.
 Dupixant (sometimes misspelled) has shown good results in clinical trials.
 The FDA continues to monitor the safety of Dupixent.
 """
 async def main():
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: Set OPENAI_API_KEY environment variable")
        return
    # Clean up previous test
    if os.path.exists(WORKING_DIR):
        shutil.rmtree(WORKING_DIR)
    os.makedirs(WORKING_DIR)
    # Set up logging to see resolution messages
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)
    print("\n" + "=" * 60)
    print("Entity Resolution Test")
    print("=" * 60)
    rag = LightRAG(
        working_dir=WORKING_DIR,
        embedding_func=openai_embed,
        llm_model_func=gpt_4o_mini_complete,
        entity_resolution_config=EntityResolutionConfig(
            enabled=True,
            fuzzy_threshold=0.85,
            vector_threshold=0.5,
            max_candidates=3,
        ),
    )
    await rag.initialize_storages()
    print("\nInserting test document...")
    print(f"Document: {TEST_DOC.strip()}")
    print("\n" + "-" * 60)
    await rag.ainsert(TEST_DOC)
    print("\n" + "-" * 60)
    print("Checking extracted entities...")
    # Read the graph to see what entities were created
    graph_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
    if os.path.exists(graph_file):
        import networkx as nx
        G = nx.read_graphml(graph_file)
        print(f"\nEntities in graph ({len(G.nodes())} total):")
        for node in sorted(G.nodes()):
            print(f"  - {node}")
        print(f"\nRelationships: {len(G.edges())}")
    else:
        print("Graph file not found")
    await rag.finalize_storages()
    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)
 if __name__ == "__main__":
    asyncio.run(main())
--- a/lightrag/api/config.py
+++ b/lightrag/api/config.py
@ -450,6 +450,20 @@ def parse_args() -> argparse.Namespace:
        "EMBEDDING_TOKEN_LIMIT", None, int, special_none=True
    )
    # Entity Resolution configuration
    args.entity_resolution_enabled = get_env_value(
        "ENTITY_RESOLUTION_ENABLED", False, bool
    )
    args.entity_resolution_fuzzy_threshold = get_env_value(
        "ENTITY_RESOLUTION_FUZZY_THRESHOLD", 0.85, float
    )
    args.entity_resolution_vector_threshold = get_env_value(
        "ENTITY_RESOLUTION_VECTOR_THRESHOLD", 0.5, float
    )
    args.entity_resolution_max_candidates = get_env_value(
        "ENTITY_RESOLUTION_MAX_CANDIDATES", 3, int
    )
    ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
    ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@ -2,6 +2,8 @@
 LightRAG FastAPI Server
 """
 from __future__ import annotations
 from fastapi import FastAPI, Depends, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
@ -642,6 +644,23 @@ def create_app(args):
                raise Exception(f"Failed to import {binding} options: {e}")
        return {}
    def create_entity_resolution_config(args) -> object | None:
        """
        Create EntityResolutionConfig from command line/env arguments.
        Returns None if entity resolution is disabled.
        """
        if not args.entity_resolution_enabled:
            return None
        from lightrag.entity_resolution import EntityResolutionConfig
        return EntityResolutionConfig(
            enabled=True,
            fuzzy_threshold=args.entity_resolution_fuzzy_threshold,
            vector_threshold=args.entity_resolution_vector_threshold,
            max_candidates=args.entity_resolution_max_candidates,
        )
    def create_optimized_embedding_function(
        config_cache: LLMConfigCache, binding, model, host, api_key, args
    ) -> EmbeddingFunc:
@ -1029,6 +1048,7 @@ def create_app(args):
                "entity_types": args.entity_types,
            },
            ollama_server_infos=ollama_server_infos,
            entity_resolution_config=create_entity_resolution_config(args),
        )
    except Exception as e:
        logger.error(f"Failed to initialize LightRAG: {e}")
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@ -443,7 +443,7 @@ class DocStatusResponse(BaseModel):
    metadata: Optional[dict[str, Any]] = Field(
        default=None, description="Additional metadata about the document"
    )
-    file_path: str = Field(description="Path to the document file")
+    file_path: Optional[str] = Field(default=None, description="Path to the document file")
    class Config:
        json_schema_extra = {
--- a/lightrag/entity_resolution/init.py
+++ b/lightrag/entity_resolution/init.py
@ -0,0 +1,29 @@
 """
 Entity Resolution Module for LightRAG
 Provides automatic entity deduplication using a 3-layer approach:
 1. Case normalization (exact match)
 2. Fuzzy string matching (typos)
 3. Vector similarity + LLM verification (semantic matches)
 """
 from .resolver import (
    resolve_entity,
    resolve_entity_with_vdb,
    ResolutionResult,
    get_cached_alias,
    store_alias,
    fuzzy_similarity,
 )
 from .config import EntityResolutionConfig, DEFAULT_CONFIG
 __all__ = [
    "resolve_entity",
    "resolve_entity_with_vdb",
    "ResolutionResult",
    "EntityResolutionConfig",
    "DEFAULT_CONFIG",
    "get_cached_alias",
    "store_alias",
    "fuzzy_similarity",
 ]
--- a/lightrag/entity_resolution/config.py
+++ b/lightrag/entity_resolution/config.py
@ -0,0 +1,57 @@
 """Configuration for Entity Resolution
 Uses the same LLM that LightRAG is configured with - no separate model config needed.
 """
 from dataclasses import dataclass, field
@dataclass
 class EntityResolutionConfig:
    """Configuration for the entity resolution system."""
    # Whether entity resolution is enabled
    enabled: bool = True
    # Fuzzy pre-resolution: Enable/disable within-batch fuzzy matching before
    # VDB lookup. When enabled, entities in the same batch are matched by string
    # similarity alone. Set to False to skip fuzzy pre-resolution entirely (only
    # exact case-insensitive matches will be accepted within batch; all other
    # resolution goes to VDB/LLM). Disabling reduces false positives but may
    # miss obvious typo corrections.
    fuzzy_pre_resolution_enabled: bool = True
    # Fuzzy string matching threshold (0-1)
    # Above this = auto-match (catches typos like Dupixant/Dupixent at 0.88)
    # Below this = continue to vector search
    # Tuning advice:
    #   0.90+ = Very conservative, near-identical strings (Dupixent/Dupixant)
    #   0.85  = Balanced default, catches typos, avoids most false positives
    #   0.80  = Aggressive, may merge distinct entities with similar names
    #   <0.75 = Not recommended, high false positive risk (Celebrex/Cerebyx=0.67)
    # Test with your domain data; pharmaceutical names need higher thresholds.
    fuzzy_threshold: float = 0.85
    # Vector similarity threshold for finding candidates
    # Low threshold = cast wide net, LLM will verify
    # 0.5 catches FDA/US Food and Drug Administration at 0.67
    vector_threshold: float = 0.5
    # Maximum number of vector candidates to verify with LLM
    # Limits cost - uses same LLM as LightRAG main config
    max_candidates: int = 3
    # LLM verification prompt template
    llm_prompt_template: str = field(
        default="""Are these two terms referring to the same entity?
 Consider typos, misspellings, abbreviations, or alternate names.
 Term A: {term_a}
 Term B: {term_b}
 Answer only YES or NO.""",
    )
 # Default configuration
 DEFAULT_CONFIG = EntityResolutionConfig()
--- a/lightrag/entity_resolution/resolver.py
+++ b/lightrag/entity_resolution/resolver.py
@ -0,0 +1,335 @@
 """Entity Resolution - 3-Layer Approach
 Layer 1: Case normalization (exact match)
 Layer 2: Fuzzy string matching (>85% = typos)
 Layer 3: Vector similarity + LLM verification (semantic matches)
 Uses the same LLM that LightRAG is configured with.
 """
 from collections.abc import Awaitable, Callable
 from dataclasses import dataclass
 from difflib import SequenceMatcher
 import numpy as np
 from lightrag.utils import logger
 from .config import DEFAULT_CONFIG, EntityResolutionConfig
@dataclass
 class ResolutionResult:
    """Result of entity resolution attempt."""
    action: str  # "match" | "new"
    matched_entity: str | None
    confidence: float
    method: str  # "exact" | "fuzzy" | "llm" | "none" | "disabled"
 def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    a_arr, b_arr = np.array(a), np.array(b)
    norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))
 def fuzzy_similarity(a: str, b: str) -> float:
    """Calculate fuzzy string similarity (0-1)."""
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()
 def find_vector_candidates(
    query_embedding: list[float],
    existing_entities: list[tuple[str, list[float]]],
    threshold: float,
 ) -> list[tuple[str, float]]:
    """Find entities with vector similarity above threshold."""
    candidates = []
    for name, embedding in existing_entities:
        sim = cosine_similarity(query_embedding, embedding)
        if sim >= threshold:
            candidates.append((name, sim))
    # Sort by similarity descending
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates
 async def llm_verify(
    term_a: str,
    term_b: str,
    llm_fn: Callable[[str], Awaitable[str]],
    prompt_template: str,
 ) -> bool:
    """Ask LLM if two terms refer to the same entity.
    Uses strict parsing with exact token matching only. Accepted responses:
    - Positive: "YES", "TRUE", "SAME", "MATCH"
    - Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME"
    Any other response defaults to False to avoid false positive merges.
    """
    prompt = prompt_template.format(term_a=term_a, term_b=term_b)
    response = await llm_fn(prompt)
    # Normalize response: strip whitespace, take first line only
    normalized = response.strip().split("\n")[0].strip().upper()
    # Remove common trailing punctuation
    normalized = normalized.rstrip(".!,")
    # Only accept exact tokens (no prefix/substring matching)
    if normalized in ("YES", "TRUE", "SAME", "MATCH"):
        return True
    if normalized in ("NO", "FALSE", "DIFFERENT", "NOT SAME"):
        return False
    # Default to False for ambiguous responses (safer than false positive)
    return False
 async def resolve_entity(
    entity_name: str,
    existing_entities: list[tuple[str, list[float]]],
    embed_fn: Callable[[str], Awaitable[list[float]]],
    llm_fn: Callable[[str], Awaitable[str]],
    config: EntityResolutionConfig = DEFAULT_CONFIG,
 ) -> ResolutionResult:
    """Resolve an entity against existing entities using 3-layer approach.
    Args:
        entity_name: The new entity name to resolve
        existing_entities: List of (name, embedding) tuples for existing entities
        embed_fn: Async function to get embedding for a string (same as LightRAG uses)
        llm_fn: Async function to query LLM (same as LightRAG uses)
        config: Resolution configuration
    Returns:
        ResolutionResult with action ("match" or "new"), matched entity,
        confidence, and method used.
    """
    if not config.enabled:
        return ResolutionResult("new", None, 0.0, "disabled")
    if not existing_entities:
        return ResolutionResult("new", None, 0.0, "none")
    normalized = entity_name.lower().strip()
    # Layer 1: Case-insensitive exact match
    for name, _ in existing_entities:
        if name.lower().strip() == normalized:
            return ResolutionResult("match", name, 1.0, "exact")
    # Layer 2: Fuzzy string matching (catches typos)
    best_fuzzy_match = None
    best_fuzzy_score = 0.0
    for name, _ in existing_entities:
        similarity = fuzzy_similarity(entity_name, name)
        if similarity > best_fuzzy_score:
            best_fuzzy_score = similarity
            best_fuzzy_match = name
    if best_fuzzy_score >= config.fuzzy_threshold:
        return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
    # Layer 3: Vector similarity + LLM verification
    embedding = await embed_fn(entity_name)
    candidates = find_vector_candidates(
        embedding,
        existing_entities,
        config.vector_threshold,
    )
    # Verify top candidates with LLM
    for candidate_name, similarity in candidates[: config.max_candidates]:
        is_same = await llm_verify(
            entity_name,
            candidate_name,
            llm_fn,
            config.llm_prompt_template,
        )
        if is_same:
            return ResolutionResult("match", candidate_name, similarity, "llm")
    # No match found - this is a new entity
    return ResolutionResult("new", None, 0.0, "none")
 async def resolve_entity_with_vdb(
    entity_name: str,
    entity_vdb,  # BaseVectorStorage - imported dynamically to avoid circular imports
    llm_fn: Callable[[str], Awaitable[str]],
    config: EntityResolutionConfig = DEFAULT_CONFIG,
 ) -> ResolutionResult:
    """Resolve an entity using VDB for similarity search.
    This is the production integration that uses LightRAG's vector database
    directly instead of requiring pre-computed embeddings.
    Args:
        entity_name: The new entity name to resolve
        entity_vdb: LightRAG's entity vector database (BaseVectorStorage)
        llm_fn: Async function to query LLM (same as LightRAG uses)
        config: Resolution configuration
    Returns:
        ResolutionResult with action ("match" or "new"), matched entity,
        confidence, and method used.
    """
    if not config.enabled:
        return ResolutionResult("new", None, 0.0, "disabled")
    if entity_vdb is None:
        return ResolutionResult("new", None, 0.0, "none")
    normalized = entity_name.lower().strip()
    # Query VDB for similar entities - cast wide net, LLM will verify
    # top_k is doubled to have enough candidates after filtering
    try:
        candidates = await entity_vdb.query(
            entity_name, top_k=config.max_candidates * 3
        )
    except Exception as e:
        # Log and skip resolution if VDB query fails
        logger.debug(f"VDB query failed for '{entity_name}': {e}")
        return ResolutionResult("new", None, 0.0, "none")
    if not candidates:
        return ResolutionResult("new", None, 0.0, "none")
    # Layer 1: Case-insensitive exact match among candidates
    for candidate in candidates:
        candidate_name = candidate.get("entity_name")
        if candidate_name and candidate_name.lower().strip() == normalized:
            return ResolutionResult("match", candidate_name, 1.0, "exact")
    # Layer 2: Fuzzy string matching (catches typos)
    best_fuzzy_match = None
    best_fuzzy_score = 0.0
    for candidate in candidates:
        candidate_name = candidate.get("entity_name")
        if not candidate_name:
            continue
        similarity = fuzzy_similarity(entity_name, candidate_name)
        if similarity > best_fuzzy_score:
            best_fuzzy_score = similarity
            best_fuzzy_match = candidate_name
    if best_fuzzy_score >= config.fuzzy_threshold:
        return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
    # Layer 3: LLM verification on top candidates
    verified_count = 0
    for candidate in candidates:
        if verified_count >= config.max_candidates:
            break
        candidate_name = candidate.get("entity_name")
        if not candidate_name:
            continue
        is_same = await llm_verify(
            entity_name,
            candidate_name,
            llm_fn,
            config.llm_prompt_template,
        )
        verified_count += 1
        if is_same:
            # Use distance from VDB if available (converted to similarity)
            similarity = 0.7  # Default confidence for LLM match
            return ResolutionResult("match", candidate_name, similarity, "llm")
    # No match found - this is a new entity
    return ResolutionResult("new", None, 0.0, "none")
 # --- Alias Cache Functions (PostgreSQL) ---
 async def get_cached_alias(
    alias: str,
    db,  # PostgresDB instance
    workspace: str,
 ) -> tuple[str, str, float] | None:
    """Check if alias is already resolved in cache.
    Args:
        alias: The entity name to look up
        db: PostgresDB instance with query method
        workspace: Workspace for isolation
    Returns:
        Tuple of (canonical_entity, method, confidence) if found, None otherwise
    """
    import logging
    from lightrag.kg.postgres_impl import SQL_TEMPLATES
    logger = logging.getLogger(__name__)
    normalized_alias = alias.lower().strip()
    sql = SQL_TEMPLATES["get_alias"]
    try:
        result = await db.query(sql, params=[workspace, normalized_alias])
        if result:
            return (
                result["canonical_entity"],
                result["method"],
                result["confidence"],
            )
    except Exception as e:
        logger.debug(f"Alias cache lookup error: {e}")
    return None
 async def store_alias(
    alias: str,
    canonical: str,
    method: str,
    confidence: float,
    db,  # PostgresDB instance
    workspace: str,
 ) -> None:
    """Store a resolution in the alias cache.
    Args:
        alias: The variant name (e.g., "FDA")
        canonical: The resolved canonical name (e.g., "US Food and Drug Administration")
        method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual')
        confidence: Resolution confidence (0-1)
        db: PostgresDB instance with execute method
        workspace: Workspace for isolation
    """
    import logging
    from datetime import datetime, timezone
    from lightrag.kg.postgres_impl import SQL_TEMPLATES
    logger = logging.getLogger(__name__)
    normalized_alias = alias.lower().strip()
    # Don't store self-referential aliases (e.g., "FDA" → "FDA")
    if normalized_alias == canonical.lower().strip():
        return
    sql = SQL_TEMPLATES["upsert_alias"]
    try:
        await db.execute(
            sql,
            data={
                "workspace": workspace,
                "alias": normalized_alias,
                "canonical_entity": canonical,
                "method": method,
                "confidence": confidence,
                "create_time": datetime.now(timezone.utc).replace(tzinfo=None),
            },
        )
    except Exception as e:
        logger.debug(f"Alias cache store error: {e}")
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@ -1130,7 +1130,14 @@ class PostgreSQLDB:
                existing_indexes = {row["indexname"] for row in existing_indexes_result}
            # Create missing indexes
            # Tables that don't have an 'id' column (use different primary key structure)
            tables_without_id = {"LIGHTRAG_ENTITY_ALIASES"}
            for k in table_names:
                # Skip tables that don't have an 'id' column
                if k in tables_without_id:
                    continue
                # Create index for id column if missing
                index_name = f"idx_{k.lower()}_id"
                if index_name not in existing_indexes:
@ -1259,6 +1266,12 @@ class PostgreSQLDB:
                f"PostgreSQL, Failed to create full entities/relations tables: {e}"
            )
        # Migrate entity aliases table to add update_time, index on canonical_entity, and confidence constraint
        try:
            await self._migrate_entity_aliases_schema()
        except Exception as e:
            logger.error(f"PostgreSQL, Failed to migrate entity aliases schema: {e}")
    async def _migrate_create_full_entities_relations_tables(self):
        """Create LIGHTRAG_FULL_ENTITIES and LIGHTRAG_FULL_RELATIONS tables if they don't exist"""
        tables_to_check = [
@ -1323,6 +1336,127 @@ class PostgreSQLDB:
            except Exception as e:
                logger.error(f"Failed to create table {table_name}: {e}")
    async def _migrate_entity_aliases_schema(self):
        """Migrate LIGHTRAG_ENTITY_ALIASES table to add update_time column, canonical index, and confidence constraint"""
        table_name = "LIGHTRAG_ENTITY_ALIASES"
        # Check if table exists first
        check_table_sql = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_name = $1
        AND table_schema = 'public'
        """
        table_exists = await self.query(check_table_sql, [table_name.lower()])
        if not table_exists:
            logger.debug(f"Table {table_name} does not exist yet, skipping migration")
            return
        # 1. Add update_time column if it doesn't exist
        check_column_sql = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = $1
        AND column_name = 'update_time'
        AND table_schema = 'public'
        """
        column_exists = await self.query(check_column_sql, [table_name.lower()])
        if not column_exists:
            try:
                # Three-step migration to add update_time column:
                # 1. Add column WITHOUT default - avoids full table rewrite on large tables
                # 2. Backfill existing rows with create_time values
                # 3. Set DEFAULT for future inserts
                # Note: There's a tiny race window between steps 1-3 where concurrent
                # inserts could get NULL. This is acceptable for this migration use case.
                #
                # Step 1: Add column WITHOUT default (existing rows get NULL)
                add_column_sql = f"""
                ALTER TABLE {table_name}
                ADD COLUMN update_time TIMESTAMP(0)
                """
                await self.execute(add_column_sql)
                logger.info(f"PostgreSQL, Added update_time column to {table_name}")
                # Step 2: Set existing rows' update_time to their create_time
                update_sql = f"""
                UPDATE {table_name}
                SET update_time = create_time
                WHERE update_time IS NULL
                """
                await self.execute(update_sql)
                logger.info(
                    f"PostgreSQL, Initialized update_time values in {table_name}"
                )
                # Step 3: Set default for future rows
                set_default_sql = f"""
                ALTER TABLE {table_name}
                ALTER COLUMN update_time SET DEFAULT CURRENT_TIMESTAMP
                """
                await self.execute(set_default_sql)
                logger.info(
                    f"PostgreSQL, Set default for update_time column in {table_name}"
                )
            except Exception as e:
                logger.error(
                    f"PostgreSQL, Failed to add update_time column to {table_name}: {e}"
                )
        # 2. Create index on (workspace, canonical_entity) for get_aliases_for_canonical query
        index_name = "idx_lightrag_entity_aliases_canonical"
        check_index_sql = """
        SELECT indexname
        FROM pg_indexes
        WHERE tablename = $1
        AND indexname = $2
        """
        index_exists = await self.query(
            check_index_sql, [table_name.lower(), index_name]
        )
        if not index_exists:
            try:
                create_index_sql = f"""
                CREATE INDEX {index_name}
                ON {table_name} (workspace, canonical_entity)
                """
                await self.execute(create_index_sql)
                logger.info(f"PostgreSQL, Created index {index_name} on {table_name}")
            except Exception as e:
                logger.error(f"PostgreSQL, Failed to create index {index_name}: {e}")
        # 3. Add CHECK constraint for confidence range if it doesn't exist
        constraint_name = "confidence_range"
        check_constraint_sql = """
        SELECT constraint_name
        FROM information_schema.table_constraints
        WHERE table_name = $1
        AND constraint_name = $2
        AND constraint_type = 'CHECK'
        AND table_schema = 'public'
        """
        constraint_exists = await self.query(
            check_constraint_sql, [table_name.lower(), constraint_name]
        )
        if not constraint_exists:
            try:
                add_constraint_sql = f"""
                ALTER TABLE {table_name}
                ADD CONSTRAINT {constraint_name}
                CHECK (confidence >= 0 AND confidence <= 1)
                """
                await self.execute(add_constraint_sql)
                logger.info(
                    f"PostgreSQL, Added CHECK constraint {constraint_name} to {table_name}"
                )
            except Exception as e:
                logger.warning(
                    f"PostgreSQL, Failed to add CHECK constraint {constraint_name} to {table_name}: {e}"
                )
    async def _create_pagination_indexes(self):
        """Create indexes to optimize pagination queries for LIGHTRAG_DOC_STATUS"""
        indexes = [
@ -1402,7 +1536,7 @@ class PostgreSQLDB:
            "VCHORDRQ": f"""
                CREATE INDEX {{vector_index_name}}
                ON {{k}} USING vchordrq (content_vector vector_cosine_ops)
-                {f'WITH (options = $${self.vchordrq_build_options}$$)' if self.vchordrq_build_options else ''}
+                {f"WITH (options = $${self.vchordrq_build_options}$$)" if self.vchordrq_build_options else ""}
            """,
        }
@ -4906,6 +5040,19 @@ TABLES = {
                    CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id)
                    )"""
    },
    "LIGHTRAG_ENTITY_ALIASES": {
        "ddl": """CREATE TABLE LIGHTRAG_ENTITY_ALIASES (
                    workspace VARCHAR(255),
                    alias VARCHAR(512),
                    canonical_entity VARCHAR(512),
                    method VARCHAR(50),
                    confidence FLOAT,
                    create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
                    update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
                    CONSTRAINT LIGHTRAG_ENTITY_ALIASES_PK PRIMARY KEY (workspace, alias),
                    CONSTRAINT confidence_range CHECK (confidence >= 0 AND confidence <= 1)
                    )"""
    },
 }
@ -5117,4 +5264,25 @@ SQL_TEMPLATES = {
    "drop_specifiy_table_workspace": """
        DELETE FROM {table_name} WHERE workspace=$1
       """,
    # Entity alias cache
    "get_alias": """
        SELECT canonical_entity, method, confidence
        FROM LIGHTRAG_ENTITY_ALIASES
        WHERE workspace=$1 AND alias=$2
        """,
    "upsert_alias": """
        INSERT INTO LIGHTRAG_ENTITY_ALIASES
            (workspace, alias, canonical_entity, method, confidence, create_time, update_time)
        VALUES ($1, $2, $3, $4, $5, $6, $6)
        ON CONFLICT (workspace, alias) DO UPDATE SET
            canonical_entity = EXCLUDED.canonical_entity,
            method = EXCLUDED.method,
            confidence = EXCLUDED.confidence,
            update_time = CURRENT_TIMESTAMP
        """,
    "get_aliases_for_canonical": """
        SELECT alias, method, confidence
        FROM LIGHTRAG_ENTITY_ALIASES
        WHERE workspace=$1 AND canonical_entity=$2
        """,
 }
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@ -26,6 +26,7 @@ from typing import (
 )
 from lightrag.prompt import PROMPTS
 from lightrag.exceptions import PipelineCancelledException
 from lightrag.entity_resolution import EntityResolutionConfig
 from lightrag.constants import (
    DEFAULT_MAX_GLEANING,
    DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
@ -217,6 +218,11 @@ class LightRAG:
        )
    )
    entity_resolution_config: EntityResolutionConfig | None = field(default=None)
    """Configuration for entity resolution (deduplication).
    Set to EntityResolutionConfig() to enable, or None to disable.
    Resolves entities like 'FDA' → 'US Food and Drug Administration'."""
    # Text chunking
    # ---
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@ -5,6 +5,7 @@ from pathlib import Path
 import asyncio
 import json
 import json_repair
 import re
 from typing import Any, AsyncIterator, overload, Literal
 from collections import Counter, defaultdict
@ -50,6 +51,13 @@ from lightrag.base import (
    QueryContextResult,
 )
 from lightrag.prompt import PROMPTS
 from lightrag.entity_resolution import (
    resolve_entity_with_vdb,
    get_cached_alias,
    store_alias,
    fuzzy_similarity,
    EntityResolutionConfig,
 )
 from lightrag.constants import (
    GRAPH_FIELD_SEP,
    DEFAULT_MAX_ENTITY_TOKENS,
@ -1590,6 +1598,180 @@ async def _rebuild_single_relationship(
            pipeline_status["history_messages"].append(status_message)
 def _has_different_numeric_suffix(name_a: str, name_b: str) -> bool:
    """Check if two names have different numeric components.
    This prevents false fuzzy matches between entities that differ only by number,
    such as "Interleukin-4" vs "Interleukin-13" (88.9% similar but semantically distinct).
    Scientific/medical entities often use numbers as key identifiers:
    - Interleukins: IL-4, IL-13, IL-17
    - Drug phases: Phase 1, Phase 2, Phase 3
    - Receptor types: Type 1, Type 2
    - Versions: v1.0, v2.0
    Args:
        name_a: First entity name
        name_b: Second entity name
    Returns:
        True if both names contain numbers but the numbers differ, False otherwise.
    """
    # Extract all numeric patterns (integers and decimals)
    pattern = r"(\d+(?:\.\d+)?)"
    nums_a = re.findall(pattern, name_a)
    nums_b = re.findall(pattern, name_b)
    # If both have numbers and they differ, these are likely distinct entities
    if nums_a and nums_b and nums_a != nums_b:
        return True
    return False
 async def _build_pre_resolution_map(
    entity_names: list[str],
    entity_types: dict[str, str],
    entity_vdb,
    llm_fn,
    config: EntityResolutionConfig,
 ) -> tuple[dict[str, str], dict[str, float]]:
    """Build resolution map before parallel processing to prevent race conditions.
    This function resolves entities against each other within the batch (using
    instant fuzzy matching) and against existing VDB entries. The resulting map
    is applied during parallel entity processing.
    Args:
        entity_names: List of entity names to resolve
        entity_types: Dict mapping entity names to their types (e.g., "person", "organization").
                      Used to prevent fuzzy matching between entities of different types.
        entity_vdb: Entity vector database for checking existing entities
        llm_fn: LLM function for semantic verification
        config: Entity resolution configuration
    Returns:
        Tuple of:
        - resolution_map: Dict mapping original entity names to their resolved canonical names.
          Only entities that need remapping are included.
        - confidence_map: Dict mapping alias to confidence score (1.0 for exact, actual
          similarity for fuzzy, result.confidence for VDB matches).
    """
    resolution_map: dict[str, str] = {}
    confidence_map: dict[str, float] = {}
    # Track canonical entities with their types: [(name, type), ...]
    canonical_entities: list[tuple[str, str]] = []
    for entity_name in entity_names:
        normalized = entity_name.lower().strip()
        entity_type = entity_types.get(entity_name, "")
        # Skip if already resolved to something in this batch
        if entity_name in resolution_map:
            continue
        # Layer 1: Case-insensitive exact match within batch
        matched = False
        for canonical, canonical_type in canonical_entities:
            if canonical.lower().strip() == normalized:
                resolution_map[entity_name] = canonical
                confidence_map[entity_name] = 1.0  # Exact match = perfect confidence
                logger.debug(
                    f"Pre-resolution (case match): '{entity_name}' → '{canonical}'"
                )
                matched = True
                break
        if matched:
            continue
        # Layer 2: Fuzzy match within batch (catches typos like Dupixant→Dupixent)
        # Only enabled when config.fuzzy_pre_resolution_enabled is True.
        # Requires: similarity >= threshold AND matching types (or unknown).
        if config.fuzzy_pre_resolution_enabled:
            for canonical, canonical_type in canonical_entities:
                similarity = fuzzy_similarity(entity_name, canonical)
                if similarity >= config.fuzzy_threshold:
                    # Type compatibility check: skip if types differ and both known.
                    # Empty/unknown types are treated as compatible to avoid
                    # blocking legitimate matches when type info is incomplete.
                    types_compatible = (
                        not entity_type
                        or not canonical_type
                        or entity_type == canonical_type
                    )
                    if not types_compatible:
                        logger.debug(
                            f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
                            f"'{entity_name}' ({entity_type}) → "
                            f"'{canonical}' ({canonical_type}) - type mismatch"
                        )
                        continue
                    # Numeric suffix check: skip if names have different numbers
                    # This prevents false matches like "Interleukin-4" → "Interleukin-13"
                    # where fuzzy similarity is high (88.9%) but entities are distinct
                    if _has_different_numeric_suffix(entity_name, canonical):
                        logger.debug(
                            f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
                            f"'{entity_name}' → '{canonical}' - different numeric suffix"
                        )
                        continue
                    # Accept the fuzzy match - emit warning for review
                    resolution_map[entity_name] = canonical
                    confidence_map[entity_name] = (
                        similarity  # Use actual similarity score
                    )
                    etype_display = entity_type or "unknown"
                    ctype_display = canonical_type or "unknown"
                    logger.warning(
                        f"Fuzzy pre-resolution accepted: '{entity_name}' → "
                        f"'{canonical}' (similarity={similarity:.3f}, "
                        f"types: {etype_display}→{ctype_display}). "
                        f"Review for correctness; adjust fuzzy_threshold or "
                        f"disable fuzzy_pre_resolution_enabled if needed."
                    )
                    matched = True
                    break
        if matched:
            continue
        # Layer 3: Check existing VDB for cross-document deduplication
        if entity_vdb and llm_fn:
            try:
                result = await resolve_entity_with_vdb(
                    entity_name, entity_vdb, llm_fn, config
                )
                if result.action == "match" and result.matched_entity:
                    resolution_map[entity_name] = result.matched_entity
                    confidence_map[entity_name] = (
                        result.confidence
                    )  # Use VDB result confidence
                    # Add canonical from VDB so batch entities can match it.
                    # VDB matches don't have type info available, use empty.
                    canonical_entities.append((result.matched_entity, ""))
                    logger.debug(
                        f"Pre-resolution (VDB {result.method}): "
                        f"'{entity_name}' → '{result.matched_entity}'"
                    )
                    continue
            except Exception as e:
                logger.debug(
                    f"Pre-resolution VDB check failed for '{entity_name}': {e}"
                )
        # No match found - this is a new canonical entity
        canonical_entities.append((entity_name, entity_type))
    if resolution_map:
        logger.info(
            f"Pre-resolution: {len(resolution_map)} entities mapped to canonical forms"
        )
    return resolution_map, confidence_map
 async def _merge_nodes_then_upsert(
    entity_name: str,
    nodes_data: list[dict],
@ -1600,8 +1782,128 @@ async def _merge_nodes_then_upsert(
    pipeline_status_lock=None,
    llm_response_cache: BaseKVStorage | None = None,
    entity_chunks_storage: BaseKVStorage | None = None,
-):
+    pre_resolution_map: dict[str, str] | None = None,
-    """Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert."""
+) -> tuple[dict, str | None]:
    """Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.
    Returns:
        Tuple of (node_data, original_entity_name). original_entity_name is set if
        entity resolution changed the name (e.g., "Dupixant" → "Dupixent"),
        otherwise None.
    """
    original_entity_name = entity_name  # Track original before resolution
    # Apply pre-resolution map immediately (prevents race conditions in parallel processing)
    pre_resolved = False
    if pre_resolution_map and entity_name in pre_resolution_map:
        entity_name = pre_resolution_map[entity_name]
        pre_resolved = True
        logger.debug(
            f"Applied pre-resolution: '{original_entity_name}' → '{entity_name}'"
        )
    # Entity Resolution: Resolve new entity against existing entities
    # Skip if already pre-resolved (to avoid redundant VDB queries)
    entity_resolution_config_raw = global_config.get("entity_resolution_config")
    entity_resolution_config = None
    if entity_resolution_config_raw:
        # Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
        if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
            entity_resolution_config = entity_resolution_config_raw
        elif isinstance(entity_resolution_config_raw, dict):
            try:
                entity_resolution_config = EntityResolutionConfig(
                    **entity_resolution_config_raw
                )
            except TypeError as e:
                logger.warning(
                    f"Invalid entity_resolution_config: {e}. "
                    f"Config: {entity_resolution_config_raw}. Skipping resolution."
                )
    # Safely check if entity resolution is enabled, handling both object and dict forms
    def _is_resolution_enabled(config) -> bool:
        if config is None:
            return False
        if isinstance(config, dict):
            return config.get("enabled", False)
        return getattr(config, "enabled", False)
    # Skip VDB resolution if entity was already pre-resolved (prevents redundant queries)
    if (
        _is_resolution_enabled(entity_resolution_config)
        and entity_vdb
        and not pre_resolved
    ):
        original_name = entity_name
        workspace = global_config.get("workspace", "")
        # Try knowledge_graph_inst.db first (more reliable), fallback to entity_vdb.db
        db = getattr(knowledge_graph_inst, "db", None) or getattr(
            entity_vdb, "db", None
        )
        # Layer 0: Check alias cache first (PostgreSQL-only - requires db connection)
        # Note: Alias caching is only available when using PostgreSQL storage backend
        if db is not None:
            try:
                cached = await get_cached_alias(original_name, db, workspace)
                if cached:
                    canonical, method, _ = cached
                    logger.debug(
                        f"Alias cache hit: '{original_name}' → '{canonical}' "
                        f"(method: {method})"
                    )
                    entity_name = canonical
            except Exception as e:
                logger.warning(
                    f"Entity resolution cache lookup failed for '{original_name}' "
                    f"(workspace: {workspace}): {type(e).__name__}: {e}. "
                    "Continuing without cache."
                )
        # Layers 1-3: Full VDB resolution (if not found in cache)
        if entity_name == original_name:
            llm_fn = global_config.get("llm_model_func")
            if llm_fn:
                try:
                    resolution = await resolve_entity_with_vdb(
                        entity_name,
                        entity_vdb,
                        llm_fn,
                        entity_resolution_config,
                    )
                    if resolution.action == "match" and resolution.matched_entity:
                        logger.info(
                            f"Entity resolution: '{entity_name}' → '{resolution.matched_entity}' "
                            f"(method: {resolution.method}, confidence: {resolution.confidence:.2f})"
                        )
                        entity_name = resolution.matched_entity
                        # Store in alias cache for next time (PostgreSQL-only)
                        # Note: Alias caching requires PostgreSQL storage backend
                        if db is not None:
                            try:
                                await store_alias(
                                    original_name,
                                    entity_name,
                                    resolution.method,
                                    resolution.confidence,
                                    db,
                                    workspace,
                                )
                            except Exception as e:
                                logger.warning(
                                    f"Failed to store entity alias '{original_name}' → '{entity_name}' "
                                    f"(workspace: {workspace}): {type(e).__name__}: {e}. "
                                    "Resolution succeeded but cache not updated."
                                )
                except Exception as e:
                    logger.warning(
                        f"Entity resolution failed for '{original_name}' "
                        f"(workspace: {workspace}): {type(e).__name__}: {e}. "
                        "Continuing with original entity name."
                    )
    already_entity_types = []
    already_source_ids = []
    already_description = []
@ -1865,7 +2167,12 @@ async def _merge_nodes_then_upsert(
            max_retries=3,
            retry_delay=0.1,
        )
-    return node_data
+
    # Return original name if resolution changed it, None otherwise
    resolved_from = (
        original_entity_name if entity_name != original_entity_name else None
    )
    return node_data, resolved_from
 async def _merge_edges_then_upsert(
@ -1882,7 +2189,13 @@ async def _merge_edges_then_upsert(
    added_entities: list = None,  # New parameter to track entities added during edge processing
    relation_chunks_storage: BaseKVStorage | None = None,
    entity_chunks_storage: BaseKVStorage | None = None,
    entity_resolution_map: dict[str, str] | None = None,  # Map original→resolved names
 ):
    # Apply entity resolution mapping to edge endpoints
    if entity_resolution_map:
        src_id = entity_resolution_map.get(src_id, src_id)
        tgt_id = entity_resolution_map.get(tgt_id, tgt_id)
    if src_id == tgt_id:
        return None
@ -2472,6 +2785,76 @@ async def merge_nodes_and_edges(
    graph_max_async = global_config.get("llm_model_max_async", 4) * 2
    semaphore = asyncio.Semaphore(graph_max_async)
    # ===== Pre-Resolution Phase: Build entity resolution map =====
    # This prevents race conditions when parallel workers process similar entities
    # IMPORTANT: Include BOTH entity names AND relation endpoints to catch all duplicates
    pre_resolution_map: dict[str, str] = {}
    entity_resolution_config_raw = global_config.get("entity_resolution_config")
    if entity_resolution_config_raw:
        # Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
        config = None
        if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
            config = entity_resolution_config_raw
        elif isinstance(entity_resolution_config_raw, dict):
            try:
                config = EntityResolutionConfig(**entity_resolution_config_raw)
            except TypeError as e:
                logger.warning(
                    f"Invalid entity_resolution_config: {e}. "
                    f"Config: {entity_resolution_config_raw}. Skipping resolution."
                )
        if config and config.enabled:
            llm_fn = global_config.get("llm_model_func")
            # Build entity_types map for type-aware fuzzy matching.
            # Use first non-empty type for entities with multiple occurrences.
            entity_types: dict[str, str] = {}
            for entity_name, entities in all_nodes.items():
                for entity_data in entities:
                    etype = entity_data.get("entity_type", "")
                    if etype:
                        entity_types[entity_name] = etype
                        break
            # Collect ALL entity names: from entities AND from relation endpoints
            # This ensures relation endpoints like "EU Medicines Agency" get resolved
            # against existing entities like "European Medicines Agency"
            all_entity_names = set(all_nodes.keys())
            for src_id, tgt_id in all_edges.keys():
                all_entity_names.add(src_id)
                all_entity_names.add(tgt_id)
            pre_resolution_map, confidence_map = await _build_pre_resolution_map(
                list(all_entity_names),
                entity_types,
                entity_vdb,
                llm_fn,
                config,
            )
            # Cache pre-resolution aliases for future lookups (PostgreSQL-only)
            # This ensures aliases discovered during batch processing are available
            # for subsequent document ingestion without re-running resolution
            db = getattr(knowledge_graph_inst, "db", None)
            if db is not None and pre_resolution_map:
                workspace = global_config.get("workspace", "")
                for alias, canonical in pre_resolution_map.items():
                    # Don't cache self-references (entity → itself)
                    if alias.lower().strip() != canonical.lower().strip():
                        try:
                            await store_alias(
                                alias=alias,
                                canonical=canonical,
                                method="pre_resolution",
                                confidence=confidence_map.get(alias, 1.0),
                                db=db,
                                workspace=workspace,
                            )
                        except Exception as e:
                            logger.debug(
                                f"Failed to cache pre-resolution alias "
                                f"'{alias}' → '{canonical}': {e}"
                            )
    # ===== Phase 1: Process all entities concurrently =====
    log_message = f"Phase 1: Processing {total_entities_count} entities from {doc_id} (async: {graph_max_async})"
    logger.info(log_message)
@ -2479,6 +2862,11 @@ async def merge_nodes_and_edges(
        pipeline_status["latest_message"] = log_message
        pipeline_status["history_messages"].append(log_message)
    # Resolution map to track original→resolved entity names (e.g., "Dupixant"→"Dupixent")
    # This will be used to remap edge endpoints in Phase 2
    entity_resolution_map: dict[str, str] = {}
    resolution_map_lock = asyncio.Lock()
    async def _locked_process_entity_name(entity_name, entities):
        async with semaphore:
            # Check for cancellation before processing entity
@ -2496,7 +2884,7 @@ async def merge_nodes_and_edges(
            ):
                try:
                    logger.debug(f"Processing entity {entity_name}")
-                    entity_data = await _merge_nodes_then_upsert(
+                    entity_data, resolved_from = await _merge_nodes_then_upsert(
                        entity_name,
                        entities,
                        knowledge_graph_inst,
@ -2506,8 +2894,15 @@ async def merge_nodes_and_edges(
                        pipeline_status_lock,
                        llm_response_cache,
                        entity_chunks_storage,
                        pre_resolution_map,
                    )
                    # Track resolution mapping for edge remapping in Phase 2
                    if resolved_from is not None:
                        resolved_to = entity_data.get("entity_name", entity_name)
                        async with resolution_map_lock:
                            entity_resolution_map[resolved_from] = resolved_to
                    return entity_data
                except Exception as e:
@ -2617,6 +3012,7 @@ async def merge_nodes_and_edges(
                        added_entities,  # Pass list to collect added entities
                        relation_chunks_storage,
                        entity_chunks_storage,  # Add entity_chunks_storage parameter
                        entity_resolution_map,  # Apply entity resolution to edge endpoints
                    )
                    if edge_data is None:
@ -2649,9 +3045,36 @@ async def merge_nodes_and_edges(
                    raise prefixed_exception from e
    # Create relationship processing tasks
-    edge_tasks = []
+    # Apply pre_resolution_map to edge endpoints to prevent duplicates from relation extraction
    # Key fixes: sort for lock ordering, filter self-loops, deduplicate merged edges
    resolved_edges: dict[tuple[str, str], list] = {}
    for edge_key, edges in all_edges.items():
-        task = asyncio.create_task(_locked_process_edges(edge_key, edges))
+        # Remap edge endpoints using pre-resolution map
        # This catches cases like "EU Medicines Agency" → "European Medicines Agency"
        resolved_src = pre_resolution_map.get(edge_key[0], edge_key[0])
        resolved_tgt = pre_resolution_map.get(edge_key[1], edge_key[1])
        # Skip self-loops created by resolution (e.g., both endpoints resolve to same entity)
        if resolved_src == resolved_tgt:
            logger.debug(
                f"Skipping self-loop after resolution: {edge_key} → ({resolved_src}, {resolved_tgt})"
            )
            continue
        # Sort for consistent lock ordering (prevents deadlocks)
        resolved_edge_key = tuple(sorted([resolved_src, resolved_tgt]))
        # Merge edges that resolve to same key (deduplication)
        if resolved_edge_key not in resolved_edges:
            resolved_edges[resolved_edge_key] = []
        resolved_edges[resolved_edge_key].extend(edges)
    # Create tasks from deduplicated edges
    edge_tasks = []
    for resolved_edge_key, merged_edges in resolved_edges.items():
        task = asyncio.create_task(
            _locked_process_edges(resolved_edge_key, merged_edges)
        )
        edge_tasks.append(task)
    # Execute relationship tasks with error handling
--- a/lightrag_webui/src/App.tsx
+++ b/lightrag_webui/src/App.tsx
@ -2,7 +2,6 @@ import { useState, useCallback, useEffect, useRef } from 'react'
 import ThemeProvider from '@/components/ThemeProvider'
 import TabVisibilityProvider from '@/contexts/TabVisibilityProvider'
 import ApiKeyAlert from '@/components/ApiKeyAlert'
 import StatusIndicator from '@/components/status/StatusIndicator'
 import { SiteInfo, webuiPrefix } from '@/lib/constants'
 import { useBackendState, useAuthStore } from '@/stores/state'
 import { useSettingsStore } from '@/stores/settings'
@ -218,7 +217,6 @@ function App() {
                </TabsContent>
              </div>
            </Tabs>
            {enableHealthCheck && <StatusIndicator />}
            <ApiKeyAlert open={apiKeyAlertOpen} onOpenChange={handleApiKeyAlertOpenChange} />
          </main>
        )}
--- a/lightrag_webui/src/components/AppSettings.tsx
+++ b/lightrag_webui/src/components/AppSettings.tsx
@ -1,9 +1,7 @@
-import { useState, useCallback } from 'react'
+import { useCallback } from 'react'
 import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/Popover'
 import Button from '@/components/ui/Button'
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/Select'
 import { useSettingsStore } from '@/stores/settings'
-import { PaletteIcon } from 'lucide-react'
+import { SunIcon, MoonIcon } from 'lucide-react'
 import { useTranslation } from 'react-i18next'
 import { cn } from '@/lib/utils'
@ -12,63 +10,39 @@ interface AppSettingsProps {
 }
 export default function AppSettings({ className }: AppSettingsProps) {
  const [opened, setOpened] = useState<boolean>(false)
  const { t } = useTranslation()
  const language = useSettingsStore.use.language()
  const setLanguage = useSettingsStore.use.setLanguage()
  const theme = useSettingsStore.use.theme()
  const setTheme = useSettingsStore.use.setTheme()
-  const handleLanguageChange = useCallback((value: string) => {
+  // Compute effective theme for icon/tooltip display when theme is 'system'
-    setLanguage(value as 'en' | 'zh' | 'fr' | 'ar' | 'zh_TW')
+  const effectiveTheme = theme === 'system'
-  }, [setLanguage])
+    ? (window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light')
    : theme
-  const handleThemeChange = useCallback((value: string) => {
+  const handleThemeToggle = useCallback(() => {
-    setTheme(value as 'light' | 'dark' | 'system')
+    if (theme === 'system') {
-  }, [setTheme])
+      // Detect actual system preference and toggle to opposite
      const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches
      setTheme(isDark ? 'light' : 'dark')
    } else {
      setTheme(theme === 'dark' ? 'light' : 'dark')
    }
  }, [theme, setTheme])
  return (
-    <Popover open={opened} onOpenChange={setOpened}>
+    <Button
-      <PopoverTrigger asChild>
+      variant="ghost"
-        <Button variant="ghost" size="icon" className={cn('h-9 w-9', className)}>
+      size="icon"
-          <PaletteIcon className="h-5 w-5" />
+      className={cn('h-9 w-9', className)}
-        </Button>
+      onClick={handleThemeToggle}
-      </PopoverTrigger>
+      tooltip={effectiveTheme === 'dark' ? t('settings.light') : t('settings.dark')}
-      <PopoverContent side="bottom" align="end" className="w-56">
+    >
-        <div className="flex flex-col gap-4">
+      {effectiveTheme === 'dark' ? (
-          <div className="flex flex-col gap-2">
+        <MoonIcon className="h-5 w-5" />
-            <label className="text-sm font-medium">{t('settings.language')}</label>
+      ) : (
-            <Select value={language} onValueChange={handleLanguageChange}>
+        <SunIcon className="h-5 w-5" />
-              <SelectTrigger>
+      )}
-                <SelectValue />
+    </Button>
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="en">English</SelectItem>
                <SelectItem value="zh">中文</SelectItem>
                <SelectItem value="fr">Français</SelectItem>
                <SelectItem value="ar">العربية</SelectItem>
                <SelectItem value="zh_TW">繁體中文</SelectItem>
              </SelectContent>
            </Select>
          </div>
          <div className="flex flex-col gap-2">
            <label className="text-sm font-medium">{t('settings.theme')}</label>
            <Select value={theme} onValueChange={handleThemeChange}>
              <SelectTrigger>
                <SelectValue />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="light">{t('settings.light')}</SelectItem>
                <SelectItem value="dark">{t('settings.dark')}</SelectItem>
                <SelectItem value="system">{t('settings.system')}</SelectItem>
              </SelectContent>
            </Select>
          </div>
        </div>
      </PopoverContent>
    </Popover>
  )
 }
--- a/lightrag_webui/src/components/status/StatusIndicator.tsx
+++ b/lightrag_webui/src/components/status/StatusIndicator.tsx
@ -4,7 +4,7 @@ import { useEffect, useState } from 'react'
 import StatusDialog from './StatusDialog'
 import { useTranslation } from 'react-i18next'
-const StatusIndicator = () => {
+const StatusIndicator = ({ className }: { className?: string }) => {
  const { t } = useTranslation()
  const health = useBackendState.use.health()
  const lastCheckTime = useBackendState.use.lastCheckTime()
@ -20,7 +20,7 @@ const StatusIndicator = () => {
  }, [lastCheckTime])
  return (
-    <div className="fixed right-4 bottom-4 flex items-center gap-2 opacity-80 select-none">
+    <div className={cn("flex items-center gap-2 opacity-80 select-none", className)}>
      <div
        className="flex cursor-pointer items-center gap-2"
        onClick={() => setDialogOpen(true)}
--- a/lightrag_webui/src/features/SiteHeader.tsx
+++ b/lightrag_webui/src/features/SiteHeader.tsx
@ -1,13 +1,14 @@
 import Button from '@/components/ui/Button'
-import { SiteInfo, webuiPrefix } from '@/lib/constants'
+import { webuiPrefix } from '@/lib/constants'
 import AppSettings from '@/components/AppSettings'
 import StatusIndicator from '@/components/status/StatusIndicator'
 import { TabsList, TabsTrigger } from '@/components/ui/Tabs'
 import { useSettingsStore } from '@/stores/settings'
 import { useAuthStore } from '@/stores/state'
 import { cn } from '@/lib/utils'
 import { useTranslation } from 'react-i18next'
 import { navigationService } from '@/services/navigation'
-import { ZapIcon, GithubIcon, LogOutIcon } from 'lucide-react'
+import { ZapIcon, LogOutIcon } from 'lucide-react'
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/Tooltip'
 interface NavigationTabProps {
@ -56,17 +57,8 @@ function TabsNavigation() {
 export default function SiteHeader() {
  const { t } = useTranslation()
-  const { isGuestMode, coreVersion, apiVersion, username, webuiTitle, webuiDescription } = useAuthStore()
+  const { isGuestMode, username, webuiTitle, webuiDescription } = useAuthStore()
-
+  const enableHealthCheck = useSettingsStore.use.enableHealthCheck()
  const versionDisplay = (coreVersion && apiVersion)
    ? `${coreVersion}/${apiVersion}`
    : null;
  // Check if frontend needs rebuild (apiVersion ends with warning symbol)
  const hasWarning = apiVersion?.endsWith('⚠️');
  const versionTooltip = hasWarning
    ? t('header.frontendNeedsRebuild')
    : versionDisplay ? `v${versionDisplay}` : '';
  const handleLogout = () => {
    navigationService.navigateToLogin();
@ -77,7 +69,6 @@ export default function SiteHeader() {
      <div className="min-w-[200px] w-auto flex items-center">
        <a href={webuiPrefix} className="flex items-center gap-2">
          <ZapIcon className="size-4 text-emerald-400" aria-hidden="true" />
          <span className="font-bold md:inline-block">{SiteInfo.name}</span>
        </a>
        {webuiTitle && (
          <div className="flex items-center">
@ -111,25 +102,7 @@ export default function SiteHeader() {
      <nav className="w-[200px] flex items-center justify-end">
        <div className="flex items-center gap-2">
-          {versionDisplay && (
+          {enableHealthCheck && <StatusIndicator />}
            <TooltipProvider>
              <Tooltip>
                <TooltipTrigger asChild>
                  <span className="text-xs text-gray-500 dark:text-gray-400 mr-1 cursor-default">
                    v{versionDisplay}
                  </span>
                </TooltipTrigger>
                <TooltipContent side="bottom">
                  {versionTooltip}
                </TooltipContent>
              </Tooltip>
            </TooltipProvider>
          )}
          <Button variant="ghost" size="icon" side="bottom" tooltip={t('header.projectRepository')}>
            <a href={SiteInfo.github} target="_blank" rel="noopener noreferrer">
              <GithubIcon className="size-4" aria-hidden="true" />
            </a>
          </Button>
          <AppSettings />
          {!isGuestMode && (
            <Button
--- a/lightrag_webui/src/stores/settings.ts
+++ b/lightrag_webui/src/stores/settings.ts
@ -87,7 +87,7 @@ interface SettingsState {
 const useSettingsStoreBase = create<SettingsState>()(
  persist(
    (set) => ({
-      theme: 'system',
+      theme: 'light',
      language: 'en',
      showPropertyPanel: true,
      showNodeSearchBar: true,
@ -238,7 +238,7 @@ const useSettingsStoreBase = create<SettingsState>()(
    {
      name: 'settings-storage',
      storage: createJSONStorage(() => localStorage),
-      version: 19,
+      version: 20,
      migrate: (state: any, version: number) => {
        if (version < 2) {
          state.showEdgeLabel = false
@ -341,6 +341,15 @@ const useSettingsStoreBase = create<SettingsState>()(
            delete state.querySettings.response_type
          }
        }
        if (version < 20) {
          // Only set defaults if values are missing (preserve user preference)
          if (!state.theme) {
            state.theme = 'light'
          }
          if (!state.language) {
            state.language = 'en'
          }
        }
        return state
      }
    }
--- a/tests/test_entity_resolution/init.py
+++ b/tests/test_entity_resolution/init.py
@ -0,0 +1 @@
 # Entity resolution tests
--- a/tests/test_entity_resolution/test_resolver.py
+++ b/tests/test_entity_resolution/test_resolver.py
@ -0,0 +1,240 @@
 """
 Unit tests for Entity Resolution
 Tests the 3-layer approach with mock embed_fn and llm_fn.
 No database or external services required.
 """
 import pytest
 from lightrag.entity_resolution import (
    EntityResolutionConfig,
    resolve_entity,
 )
 # Mock embeddings - pre-computed for test entities
 # These simulate what an embedding model would return
 MOCK_EMBEDDINGS = {
    # FDA and full name have ~0.67 similarity (based on real test)
    "fda": [0.1, 0.2, 0.3, 0.4, 0.5],
    "us food and drug administration": [0.15, 0.25, 0.28, 0.38, 0.52],
    # Dupixent and dupilumab have ~0.63 similarity
    "dupixent": [0.5, 0.6, 0.7, 0.8, 0.9],
    "dupilumab": [0.48, 0.58, 0.72, 0.78, 0.88],
    # Celebrex and Cerebyx are different (low similarity)
    "celebrex": [0.9, 0.1, 0.2, 0.3, 0.4],
    "cerebyx": [0.1, 0.9, 0.8, 0.7, 0.6],
    # Default for unknown entities
    "default": [0.0, 0.0, 0.0, 0.0, 0.0],
 }
 # Mock LLM responses
 MOCK_LLM_RESPONSES = {
    ("fda", "us food and drug administration"): "YES",
    ("us food and drug administration", "fda"): "YES",
    ("dupixent", "dupilumab"): "YES",
    ("dupilumab", "dupixent"): "YES",
    ("heart attack", "myocardial infarction"): "YES",
    ("celebrex", "cerebyx"): "NO",
    ("metformin", "metoprolol"): "NO",
 }
 async def mock_embed_fn(text: str) -> list[float]:
    """Mock embedding function."""
    key = text.lower().strip()
    return MOCK_EMBEDDINGS.get(key, MOCK_EMBEDDINGS["default"])
 async def mock_llm_fn(prompt: str) -> str:
    """Mock LLM function that parses the prompt and returns YES/NO."""
    # Extract term_a and term_b from the prompt
    lines = prompt.strip().split("\n")
    term_a = None
    term_b = None
    for line in lines:
        if line.startswith("Term A:"):
            term_a = line.replace("Term A:", "").strip().lower()
        elif line.startswith("Term B:"):
            term_b = line.replace("Term B:", "").strip().lower()
    if term_a and term_b:
        # Check both orderings
        response = MOCK_LLM_RESPONSES.get((term_a, term_b))
        if response is None:
            response = MOCK_LLM_RESPONSES.get((term_b, term_a), "NO")
        return response
    return "NO"
 # Test fixtures
@pytest.fixture
 def existing_entities():
    """Existing entities in the knowledge graph."""
    return [
        (
            "US Food and Drug Administration",
            MOCK_EMBEDDINGS["us food and drug administration"],
        ),
        ("Dupixent", MOCK_EMBEDDINGS["dupixent"]),
        ("Celebrex", MOCK_EMBEDDINGS["celebrex"]),
    ]
@pytest.fixture
 def config():
    """Default resolution config."""
    return EntityResolutionConfig()
 # Layer 1: Case normalization tests
 class TestCaseNormalization:
    @pytest.mark.asyncio
    async def test_exact_match_same_case(self, existing_entities, config):
        """Exact match with same case."""
        result = await resolve_entity(
            "Dupixent",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "Dupixent"
        assert result.method == "exact"
        assert result.confidence == 1.0
    @pytest.mark.asyncio
    async def test_exact_match_different_case(self, existing_entities, config):
        """DUPIXENT should match Dupixent via case normalization."""
        result = await resolve_entity(
            "DUPIXENT",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "Dupixent"
        assert result.method == "exact"
    @pytest.mark.asyncio
    async def test_exact_match_lowercase(self, existing_entities, config):
        """dupixent should match Dupixent."""
        result = await resolve_entity(
            "dupixent",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.method == "exact"
 # Layer 2: Fuzzy matching tests
 class TestFuzzyMatching:
    @pytest.mark.asyncio
    async def test_fuzzy_match_typo(self, existing_entities, config):
        """Dupixant (typo) should match Dupixent via fuzzy matching (88%)."""
        result = await resolve_entity(
            "Dupixant",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "Dupixent"
        assert result.method == "fuzzy"
        assert result.confidence >= 0.85
    @pytest.mark.asyncio
    async def test_fuzzy_rejects_below_threshold(self, existing_entities, config):
        """Celebrex vs Cerebyx is 67% - should NOT fuzzy match."""
        # Add Cerebyx as the query (Celebrex exists)
        result = await resolve_entity(
            "Cerebyx",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        # Should not be fuzzy match (67% < 85%)
        assert result.method != "fuzzy" or result.action == "new"
 # Layer 3: LLM verification tests
 class TestLLMVerification:
    @pytest.mark.asyncio
    async def test_llm_matches_acronym(self, existing_entities, config):
        """FDA should match US Food and Drug Administration via LLM."""
        result = await resolve_entity(
            "FDA",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "US Food and Drug Administration"
        assert result.method == "llm"
    @pytest.mark.asyncio
    async def test_llm_matches_brand_generic(self, config):
        """Dupixent should match dupilumab via LLM."""
        existing = [
            ("dupilumab", MOCK_EMBEDDINGS["dupilumab"]),
        ]
        result = await resolve_entity(
            "Dupixent",
            existing,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "dupilumab"
        assert result.method == "llm"
 # Edge cases
 class TestEdgeCases:
    @pytest.mark.asyncio
    async def test_empty_existing_entities(self, config):
        """New entity when no existing entities."""
        result = await resolve_entity(
            "NewEntity",
            [],
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "new"
    @pytest.mark.asyncio
    async def test_disabled_resolution(self, existing_entities):
        """Resolution disabled returns new."""
        config = EntityResolutionConfig(enabled=False)
        result = await resolve_entity(
            "Dupixent",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "new"
        assert result.method == "disabled"
    @pytest.mark.asyncio
    async def test_genuinely_new_entity(self, existing_entities, config):
        """Completely new entity should return 'new'."""
        result = await resolve_entity(
            "CompletelyNewDrug",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "new"
        assert result.method == "none"