feat: add automatic entity resolution with 3-layer matching

Implement automatic entity resolution to prevent duplicate nodes in the
knowledge graph. The system uses a 3-layer approach:

1. Case-insensitive exact matching (free, instant)
2. Fuzzy string matching >85% threshold (free, instant)
3. Vector similarity + LLM verification (for acronyms/synonyms)

Key features:
- Pre-resolution phase prevents race conditions in parallel processing
- Numeric suffix detection blocks false matches (IL-4 ≠ IL-13)
- PostgreSQL alias cache for fast lookups on subsequent ingestion
- Configurable thresholds via environment variables

Bug fixes included:
- Fix fuzzy matching false positives for numbered entities
- Fix alias cache not being populated (missing db parameter)
- Exclude the entity_aliases table from generic id index creation

New files:
- lightrag/entity_resolution/ - Core resolution module
- tests/test_entity_resolution/ - Unit tests
- docker/postgres-age-vector/ - Custom PG image with pgvector + AGE
- docker-compose.test.yml - Integration test environment

Configuration (env.example):
- ENTITY_RESOLUTION_ENABLED=true
- ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
- ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
- ENTITY_RESOLUTION_MAX_CANDIDATES=3
clssck committed 2025-11-27 12:43:45 +01:00
parent 4f12fe121d, commit 48c7732edc
20 changed files with 1561 additions and 101 deletions

docker-compose.test.yml (new file, 84 lines)

@@ -0,0 +1,84 @@
name: lightrag-entity-resolution-test
services:
postgres:
container_name: lightrag-postgres
build:
context: ./docker/postgres-age-vector
dockerfile: Dockerfile
environment:
POSTGRES_DB: lightrag
POSTGRES_USER: lightrag
POSTGRES_PASSWORD: lightrag_pass
ports:
- "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres
volumes:
- pgdata_test:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
interval: 5s
timeout: 5s
retries: 5
lightrag:
container_name: lightrag-test
build:
context: .
dockerfile: Dockerfile
ports:
- "9622:9621" # Use 9622 to avoid conflict
volumes:
- ./data/rag_storage_test:/app/data/rag_storage
- ./data/inputs_test:/app/data/inputs
environment:
# Server
- HOST=0.0.0.0
- PORT=9621
- LOG_LEVEL=DEBUG
# LLM (OpenAI)
- LLM_BINDING=openai
- LLM_MODEL=gpt-4o-mini
- LLM_BINDING_HOST=https://api.openai.com/v1
- LLM_BINDING_API_KEY=${OPENAI_API_KEY}
# Embedding
- EMBEDDING_BINDING=openai
- EMBEDDING_MODEL=text-embedding-3-small
- EMBEDDING_DIM=1536
- EMBEDDING_BINDING_HOST=https://api.openai.com/v1
- EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}
# Storage Configuration - Full PostgreSQL!
# Custom postgres image has pgvector + Apache AGE
- LIGHTRAG_KV_STORAGE=PGKVStorage
- LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
- LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
- LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_USER=lightrag
- POSTGRES_PASSWORD=lightrag_pass
- POSTGRES_DATABASE=lightrag
# Entity Resolution - ENABLED!
- ENTITY_RESOLUTION_ENABLED=true
- ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
- ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
- ENTITY_RESOLUTION_MAX_CANDIDATES=3
# Processing
- MAX_ASYNC=4
- CHUNK_SIZE=1200
depends_on:
postgres:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
interval: 10s
timeout: 5s
retries: 10
start_period: 30s
volumes:
pgdata_test:
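
For a quick smoke test of this stack from the host, a polling sketch like the one below can hit the remapped health endpoint. The URL, port, and timings mirror the compose file above; the function name and retry loop are illustrative only.

import time
import urllib.request

# Host port 9622 maps to 9621 inside the container (see ports above).
HEALTH_URL = "http://localhost:9622/health"

def wait_for_health(url: str = HEALTH_URL, attempts: int = 10, delay: float = 5.0) -> bool:
    """Poll the LightRAG health endpoint until it responds or attempts run out."""
    for _ in range(attempts):
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # container may still be starting
        time.sleep(delay)
    return False

if __name__ == "__main__":
    print("healthy" if wait_for_health() else "not responding")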

docker/postgres-age-vector/Dockerfile (new file)

@@ -0,0 +1,26 @@
# Start from pgvector image (has vector extension pre-built correctly)
FROM pgvector/pgvector:pg17
# Install build dependencies for AGE
RUN apt-get update && apt-get install -y \
build-essential \
git \
postgresql-server-dev-17 \
libreadline-dev \
zlib1g-dev \
flex \
bison \
&& rm -rf /var/lib/apt/lists/*
# Install Apache AGE 1.6.0 for PG17
RUN cd /tmp \
&& git clone --branch release/PG17/1.6.0 https://github.com/apache/age.git \
&& cd age \
&& make \
&& make install \
&& rm -rf /tmp/age
# Add initialization script to create extensions
RUN echo "CREATE EXTENSION IF NOT EXISTS vector;" > /docker-entrypoint-initdb.d/01-vector.sql \
&& echo "CREATE EXTENSION IF NOT EXISTS age;" > /docker-entrypoint-initdb.d/02-age.sql \
&& echo "SET search_path = ag_catalog, public;" >> /docker-entrypoint-initdb.d/02-age.sql

env.example

@@ -127,6 +127,16 @@ SUMMARY_LANGUAGE=English
### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
###########################################################
### Entity Resolution Configuration
### Automatically deduplicates entities (e.g., "FDA" = "US Food and Drug Administration")
### Uses 3-layer approach: case normalization → fuzzy matching → LLM verification
###########################################################
# ENTITY_RESOLUTION_ENABLED=true
# ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
# ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
# ENTITY_RESOLUTION_MAX_CANDIDATES=3
### Chunk size for document splitting, 500~1500 is recommended
# CHUNK_SIZE=1200
# CHUNK_OVERLAP_SIZE=100
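
The four ENTITY_RESOLUTION_* variables above map one-to-one onto EntityResolutionConfig fields. A minimal sketch of that mapping, using plain os.getenv instead of the server's own get_env_value helper (shown later in this diff); config_from_env is a hypothetical name:

import os
from lightrag.entity_resolution import EntityResolutionConfig

def config_from_env() -> EntityResolutionConfig | None:
    """Build an EntityResolutionConfig from the env.example variables above."""
    if os.getenv("ENTITY_RESOLUTION_ENABLED", "false").lower() not in ("1", "true", "yes"):
        return None  # mirrors the server: disabled means no config at all
    return EntityResolutionConfig(
        enabled=True,
        fuzzy_threshold=float(os.getenv("ENTITY_RESOLUTION_FUZZY_THRESHOLD", "0.85")),
        vector_threshold=float(os.getenv("ENTITY_RESOLUTION_VECTOR_THRESHOLD", "0.5")),
        max_candidates=int(os.getenv("ENTITY_RESOLUTION_MAX_CANDIDATES", "3")),
    )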

Quick entity-resolution test script (new file)

@@ -0,0 +1,93 @@
"""
Quick test for Entity Resolution feature.
Tests that:
1. "FDA" and "US Food and Drug Administration" resolve to the same entity
2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching
"""
import asyncio
import os
import shutil
from lightrag import LightRAG
from lightrag.entity_resolution import EntityResolutionConfig
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.utils import logger
import logging
WORKING_DIR = "./test_entity_resolution"
# Test document with entities that should be deduplicated
TEST_DOC = """
The FDA approved Dupixent for treating eczema in 2017.
The US Food and Drug Administration later expanded the drug's indications.
Dupixant (sometimes misspelled) has shown good results in clinical trials.
The FDA continues to monitor the safety of Dupixent.
"""
async def main():
if not os.getenv("OPENAI_API_KEY"):
print("Error: Set OPENAI_API_KEY environment variable")
return
# Clean up previous test
if os.path.exists(WORKING_DIR):
shutil.rmtree(WORKING_DIR)
os.makedirs(WORKING_DIR)
# Set up logging to see resolution messages
logging.basicConfig(level=logging.DEBUG)
logger.setLevel(logging.DEBUG)
print("\n" + "=" * 60)
print("Entity Resolution Test")
print("=" * 60)
rag = LightRAG(
working_dir=WORKING_DIR,
embedding_func=openai_embed,
llm_model_func=gpt_4o_mini_complete,
entity_resolution_config=EntityResolutionConfig(
enabled=True,
fuzzy_threshold=0.85,
vector_threshold=0.5,
max_candidates=3,
),
)
await rag.initialize_storages()
print("\nInserting test document...")
print(f"Document: {TEST_DOC.strip()}")
print("\n" + "-" * 60)
await rag.ainsert(TEST_DOC)
print("\n" + "-" * 60)
print("Checking extracted entities...")
# Read the graph to see what entities were created
graph_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
if os.path.exists(graph_file):
import networkx as nx
G = nx.read_graphml(graph_file)
print(f"\nEntities in graph ({len(G.nodes())} total):")
for node in sorted(G.nodes()):
print(f" - {node}")
print(f"\nRelationships: {len(G.edges())}")
else:
print("Graph file not found")
await rag.finalize_storages()
print("\n" + "=" * 60)
print("Test complete!")
print("=" * 60)
if __name__ == "__main__":
asyncio.run(main())

API server configuration (parse_args)

@@ -450,6 +450,20 @@ def parse_args() -> argparse.Namespace:
"EMBEDDING_TOKEN_LIMIT", None, int, special_none=True
)
# Entity Resolution configuration
args.entity_resolution_enabled = get_env_value(
"ENTITY_RESOLUTION_ENABLED", False, bool
)
args.entity_resolution_fuzzy_threshold = get_env_value(
"ENTITY_RESOLUTION_FUZZY_THRESHOLD", 0.85, float
)
args.entity_resolution_vector_threshold = get_env_value(
"ENTITY_RESOLUTION_VECTOR_THRESHOLD", 0.5, float
)
args.entity_resolution_max_candidates = get_env_value(
"ENTITY_RESOLUTION_MAX_CANDIDATES", 3, int
)
ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag

LightRAG FastAPI server (create_app)

@@ -2,6 +2,8 @@
LightRAG FastAPI Server
"""
from __future__ import annotations
from fastapi import FastAPI, Depends, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
@@ -642,6 +644,23 @@ def create_app(args):
raise Exception(f"Failed to import {binding} options: {e}")
return {}
def create_entity_resolution_config(args) -> object | None:
"""
Create EntityResolutionConfig from command line/env arguments.
Returns None if entity resolution is disabled.
"""
if not args.entity_resolution_enabled:
return None
from lightrag.entity_resolution import EntityResolutionConfig
return EntityResolutionConfig(
enabled=True,
fuzzy_threshold=args.entity_resolution_fuzzy_threshold,
vector_threshold=args.entity_resolution_vector_threshold,
max_candidates=args.entity_resolution_max_candidates,
)
def create_optimized_embedding_function(
config_cache: LLMConfigCache, binding, model, host, api_key, args
) -> EmbeddingFunc:
@@ -1029,6 +1048,7 @@ def create_app(args):
"entity_types": args.entity_types,
},
ollama_server_infos=ollama_server_infos,
entity_resolution_config=create_entity_resolution_config(args),
)
except Exception as e:
logger.error(f"Failed to initialize LightRAG: {e}")

API models (DocStatusResponse)

@@ -443,7 +443,7 @@ class DocStatusResponse(BaseModel):
metadata: Optional[dict[str, Any]] = Field(
default=None, description="Additional metadata about the document"
)
file_path: str = Field(description="Path to the document file")
file_path: Optional[str] = Field(default=None, description="Path to the document file")
class Config:
json_schema_extra = {

lightrag/entity_resolution/__init__.py (new file)

@@ -0,0 +1,29 @@
"""
Entity Resolution Module for LightRAG
Provides automatic entity deduplication using a 3-layer approach:
1. Case normalization (exact match)
2. Fuzzy string matching (typos)
3. Vector similarity + LLM verification (semantic matches)
"""
from .resolver import (
resolve_entity,
resolve_entity_with_vdb,
ResolutionResult,
get_cached_alias,
store_alias,
fuzzy_similarity,
)
from .config import EntityResolutionConfig, DEFAULT_CONFIG
__all__ = [
"resolve_entity",
"resolve_entity_with_vdb",
"ResolutionResult",
"EntityResolutionConfig",
"DEFAULT_CONFIG",
"get_cached_alias",
"store_alias",
"fuzzy_similarity",
]

lightrag/entity_resolution/config.py (new file)

@@ -0,0 +1,57 @@
"""Configuration for Entity Resolution
Uses the same LLM that LightRAG is configured with - no separate model config needed.
"""
from dataclasses import dataclass, field
@dataclass
class EntityResolutionConfig:
"""Configuration for the entity resolution system."""
# Whether entity resolution is enabled
enabled: bool = True
# Fuzzy pre-resolution: Enable/disable within-batch fuzzy matching before
# VDB lookup. When enabled, entities in the same batch are matched by string
# similarity alone. Set to False to skip fuzzy pre-resolution entirely (only
# exact case-insensitive matches will be accepted within batch; all other
# resolution goes to VDB/LLM). Disabling reduces false positives but may
# miss obvious typo corrections.
fuzzy_pre_resolution_enabled: bool = True
# Fuzzy string matching threshold (0-1)
# Above this = auto-match (catches typos like Dupixant/Dupixent at 0.88)
# Below this = continue to vector search
# Tuning advice:
# 0.90+ = Very conservative, near-identical strings (Dupixent/Dupixant)
# 0.85 = Balanced default, catches typos, avoids most false positives
# 0.80 = Aggressive, may merge distinct entities with similar names
# <0.75 = Not recommended, high false positive risk (Celebrex/Cerebyx=0.67)
# Test with your domain data; pharmaceutical names need higher thresholds.
fuzzy_threshold: float = 0.85
# Vector similarity threshold for finding candidates
# Low threshold = cast wide net, LLM will verify
# 0.5 catches FDA/US Food and Drug Administration at 0.67
vector_threshold: float = 0.5
# Maximum number of vector candidates to verify with LLM
# Limits cost - uses same LLM as LightRAG main config
max_candidates: int = 3
# LLM verification prompt template
llm_prompt_template: str = field(
default="""Are these two terms referring to the same entity?
Consider typos, misspellings, abbreviations, or alternate names.
Term A: {term_a}
Term B: {term_b}
Answer only YES or NO.""",
)
# Default configuration
DEFAULT_CONFIG = EntityResolutionConfig()
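
The similarity figures cited in the comments above are easy to reproduce, since fuzzy matching (see resolver.py below) is just difflib's SequenceMatcher ratio over lowercased, stripped strings:

from difflib import SequenceMatcher

def ratio(a: str, b: str) -> float:
    # Mirrors fuzzy_similarity in resolver.py below.
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()

print(ratio("Dupixent", "Dupixant"))  # 0.875 -> above the 0.85 default, auto-match
print(ratio("Celebrex", "Cerebyx"))   # ~0.67 -> below threshold, falls through to vector/LLM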

lightrag/entity_resolution/resolver.py (new file)

@@ -0,0 +1,335 @@
"""Entity Resolution - 3-Layer Approach
Layer 1: Case normalization (exact match)
Layer 2: Fuzzy string matching (>85% = typos)
Layer 3: Vector similarity + LLM verification (semantic matches)
Uses the same LLM that LightRAG is configured with.
"""
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from difflib import SequenceMatcher
import numpy as np
from lightrag.utils import logger
from .config import DEFAULT_CONFIG, EntityResolutionConfig
@dataclass
class ResolutionResult:
"""Result of entity resolution attempt."""
action: str # "match" | "new"
matched_entity: str | None
confidence: float
method: str # "exact" | "fuzzy" | "llm" | "none" | "disabled"
def cosine_similarity(a: list[float], b: list[float]) -> float:
"""Calculate cosine similarity between two vectors."""
a_arr, b_arr = np.array(a), np.array(b)
norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))
def fuzzy_similarity(a: str, b: str) -> float:
"""Calculate fuzzy string similarity (0-1)."""
return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()
def find_vector_candidates(
query_embedding: list[float],
existing_entities: list[tuple[str, list[float]]],
threshold: float,
) -> list[tuple[str, float]]:
"""Find entities with vector similarity above threshold."""
candidates = []
for name, embedding in existing_entities:
sim = cosine_similarity(query_embedding, embedding)
if sim >= threshold:
candidates.append((name, sim))
# Sort by similarity descending
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates
async def llm_verify(
term_a: str,
term_b: str,
llm_fn: Callable[[str], Awaitable[str]],
prompt_template: str,
) -> bool:
"""Ask LLM if two terms refer to the same entity.
Uses strict parsing with exact token matching only. Accepted responses:
- Positive: "YES", "TRUE", "SAME", "MATCH"
- Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME"
Any other response defaults to False to avoid false positive merges.
"""
prompt = prompt_template.format(term_a=term_a, term_b=term_b)
response = await llm_fn(prompt)
# Normalize response: strip whitespace, take first line only
normalized = response.strip().split("\n")[0].strip().upper()
# Remove common trailing punctuation
normalized = normalized.rstrip(".!,")
# Only accept exact tokens (no prefix/substring matching)
if normalized in ("YES", "TRUE", "SAME", "MATCH"):
return True
if normalized in ("NO", "FALSE", "DIFFERENT", "NOT SAME"):
return False
# Default to False for ambiguous responses (safer than false positive)
return False
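
A small illustration of that strict parsing with stub LLM callables (the stub names, responses, and template are invented for the demo):

import asyncio
from lightrag.entity_resolution.resolver import llm_verify  # the function above

TEMPLATE = "Term A: {term_a}\nTerm B: {term_b}\nAnswer only YES or NO."

async def terse(prompt: str) -> str:
    return "YES."  # trailing '.' is stripped -> exact token -> True

async def chatty(prompt: str) -> str:
    return "Yes, they are the same entity."  # not an exact token -> False

print(asyncio.run(llm_verify("FDA", "US Food and Drug Administration", terse, TEMPLATE)))   # True
print(asyncio.run(llm_verify("FDA", "US Food and Drug Administration", chatty, TEMPLATE)))  # False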
async def resolve_entity(
entity_name: str,
existing_entities: list[tuple[str, list[float]]],
embed_fn: Callable[[str], Awaitable[list[float]]],
llm_fn: Callable[[str], Awaitable[str]],
config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
"""Resolve an entity against existing entities using 3-layer approach.
Args:
entity_name: The new entity name to resolve
existing_entities: List of (name, embedding) tuples for existing entities
embed_fn: Async function to get embedding for a string (same as LightRAG uses)
llm_fn: Async function to query LLM (same as LightRAG uses)
config: Resolution configuration
Returns:
ResolutionResult with action ("match" or "new"), matched entity,
confidence, and method used.
"""
if not config.enabled:
return ResolutionResult("new", None, 0.0, "disabled")
if not existing_entities:
return ResolutionResult("new", None, 0.0, "none")
normalized = entity_name.lower().strip()
# Layer 1: Case-insensitive exact match
for name, _ in existing_entities:
if name.lower().strip() == normalized:
return ResolutionResult("match", name, 1.0, "exact")
# Layer 2: Fuzzy string matching (catches typos)
best_fuzzy_match = None
best_fuzzy_score = 0.0
for name, _ in existing_entities:
similarity = fuzzy_similarity(entity_name, name)
if similarity > best_fuzzy_score:
best_fuzzy_score = similarity
best_fuzzy_match = name
if best_fuzzy_score >= config.fuzzy_threshold:
return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
# Layer 3: Vector similarity + LLM verification
embedding = await embed_fn(entity_name)
candidates = find_vector_candidates(
embedding,
existing_entities,
config.vector_threshold,
)
# Verify top candidates with LLM
for candidate_name, similarity in candidates[: config.max_candidates]:
is_same = await llm_verify(
entity_name,
candidate_name,
llm_fn,
config.llm_prompt_template,
)
if is_same:
return ResolutionResult("match", candidate_name, similarity, "llm")
# No match found - this is a new entity
return ResolutionResult("new", None, 0.0, "none")
async def resolve_entity_with_vdb(
entity_name: str,
entity_vdb, # BaseVectorStorage - imported dynamically to avoid circular imports
llm_fn: Callable[[str], Awaitable[str]],
config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
"""Resolve an entity using VDB for similarity search.
This is the production integration that uses LightRAG's vector database
directly instead of requiring pre-computed embeddings.
Args:
entity_name: The new entity name to resolve
entity_vdb: LightRAG's entity vector database (BaseVectorStorage)
llm_fn: Async function to query LLM (same as LightRAG uses)
config: Resolution configuration
Returns:
ResolutionResult with action ("match" or "new"), matched entity,
confidence, and method used.
"""
if not config.enabled:
return ResolutionResult("new", None, 0.0, "disabled")
if entity_vdb is None:
return ResolutionResult("new", None, 0.0, "none")
normalized = entity_name.lower().strip()
# Query VDB for similar entities - cast wide net, LLM will verify
# top_k is max_candidates * 3 so enough candidates survive filtering
try:
candidates = await entity_vdb.query(
entity_name, top_k=config.max_candidates * 3
)
except Exception as e:
# Log and skip resolution if VDB query fails
logger.debug(f"VDB query failed for '{entity_name}': {e}")
return ResolutionResult("new", None, 0.0, "none")
if not candidates:
return ResolutionResult("new", None, 0.0, "none")
# Layer 1: Case-insensitive exact match among candidates
for candidate in candidates:
candidate_name = candidate.get("entity_name")
if candidate_name and candidate_name.lower().strip() == normalized:
return ResolutionResult("match", candidate_name, 1.0, "exact")
# Layer 2: Fuzzy string matching (catches typos)
best_fuzzy_match = None
best_fuzzy_score = 0.0
for candidate in candidates:
candidate_name = candidate.get("entity_name")
if not candidate_name:
continue
similarity = fuzzy_similarity(entity_name, candidate_name)
if similarity > best_fuzzy_score:
best_fuzzy_score = similarity
best_fuzzy_match = candidate_name
if best_fuzzy_score >= config.fuzzy_threshold:
return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
# Layer 3: LLM verification on top candidates
verified_count = 0
for candidate in candidates:
if verified_count >= config.max_candidates:
break
candidate_name = candidate.get("entity_name")
if not candidate_name:
continue
is_same = await llm_verify(
entity_name,
candidate_name,
llm_fn,
config.llm_prompt_template,
)
verified_count += 1
if is_same:
# Use distance from VDB if available (converted to similarity)
similarity = 0.7 # Default confidence for LLM match
return ResolutionResult("match", candidate_name, similarity, "llm")
# No match found - this is a new entity
return ResolutionResult("new", None, 0.0, "none")
# --- Alias Cache Functions (PostgreSQL) ---
async def get_cached_alias(
alias: str,
db, # PostgresDB instance
workspace: str,
) -> tuple[str, str, float] | None:
"""Check if alias is already resolved in cache.
Args:
alias: The entity name to look up
db: PostgresDB instance with query method
workspace: Workspace for isolation
Returns:
Tuple of (canonical_entity, method, confidence) if found, None otherwise
"""
import logging
from lightrag.kg.postgres_impl import SQL_TEMPLATES
logger = logging.getLogger(__name__)
normalized_alias = alias.lower().strip()
sql = SQL_TEMPLATES["get_alias"]
try:
result = await db.query(sql, params=[workspace, normalized_alias])
if result:
return (
result["canonical_entity"],
result["method"],
result["confidence"],
)
except Exception as e:
logger.debug(f"Alias cache lookup error: {e}")
return None
async def store_alias(
alias: str,
canonical: str,
method: str,
confidence: float,
db, # PostgresDB instance
workspace: str,
) -> None:
"""Store a resolution in the alias cache.
Args:
alias: The variant name (e.g., "FDA")
canonical: The resolved canonical name (e.g., "US Food and Drug Administration")
method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual')
confidence: Resolution confidence (0-1)
db: PostgresDB instance with execute method
workspace: Workspace for isolation
"""
import logging
from datetime import datetime, timezone
from lightrag.kg.postgres_impl import SQL_TEMPLATES
logger = logging.getLogger(__name__)
normalized_alias = alias.lower().strip()
# Don't store self-referential aliases (e.g., "FDA" → "FDA")
if normalized_alias == canonical.lower().strip():
return
sql = SQL_TEMPLATES["upsert_alias"]
try:
await db.execute(
sql,
data={
"workspace": workspace,
"alias": normalized_alias,
"canonical_entity": canonical,
"method": method,
"confidence": confidence,
"create_time": datetime.now(timezone.utc).replace(tzinfo=None),
},
)
except Exception as e:
logger.debug(f"Alias cache store error: {e}")

lightrag/kg/postgres_impl.py

@@ -1130,7 +1130,14 @@ class PostgreSQLDB:
existing_indexes = {row["indexname"] for row in existing_indexes_result}
# Create missing indexes
# Tables that don't have an 'id' column (use different primary key structure)
tables_without_id = {"LIGHTRAG_ENTITY_ALIASES"}
for k in table_names:
# Skip tables that don't have an 'id' column
if k in tables_without_id:
continue
# Create index for id column if missing
index_name = f"idx_{k.lower()}_id"
if index_name not in existing_indexes:
@@ -1259,6 +1266,12 @@
f"PostgreSQL, Failed to create full entities/relations tables: {e}"
)
# Migrate entity aliases table to add update_time, index on canonical_entity, and confidence constraint
try:
await self._migrate_entity_aliases_schema()
except Exception as e:
logger.error(f"PostgreSQL, Failed to migrate entity aliases schema: {e}")
async def _migrate_create_full_entities_relations_tables(self):
"""Create LIGHTRAG_FULL_ENTITIES and LIGHTRAG_FULL_RELATIONS tables if they don't exist"""
tables_to_check = [
@@ -1323,6 +1336,127 @@
except Exception as e:
logger.error(f"Failed to create table {table_name}: {e}")
async def _migrate_entity_aliases_schema(self):
"""Migrate LIGHTRAG_ENTITY_ALIASES table to add update_time column, canonical index, and confidence constraint"""
table_name = "LIGHTRAG_ENTITY_ALIASES"
# Check if table exists first
check_table_sql = """
SELECT table_name
FROM information_schema.tables
WHERE table_name = $1
AND table_schema = 'public'
"""
table_exists = await self.query(check_table_sql, [table_name.lower()])
if not table_exists:
logger.debug(f"Table {table_name} does not exist yet, skipping migration")
return
# 1. Add update_time column if it doesn't exist
check_column_sql = """
SELECT column_name
FROM information_schema.columns
WHERE table_name = $1
AND column_name = 'update_time'
AND table_schema = 'public'
"""
column_exists = await self.query(check_column_sql, [table_name.lower()])
if not column_exists:
try:
# Three-step migration to add update_time column:
# 1. Add column WITHOUT default - avoids full table rewrite on large tables
# 2. Backfill existing rows with create_time values
# 3. Set DEFAULT for future inserts
# Note: There's a tiny race window between steps 1-3 where concurrent
# inserts could get NULL. This is acceptable for this migration use case.
#
# Step 1: Add column WITHOUT default (existing rows get NULL)
add_column_sql = f"""
ALTER TABLE {table_name}
ADD COLUMN update_time TIMESTAMP(0)
"""
await self.execute(add_column_sql)
logger.info(f"PostgreSQL, Added update_time column to {table_name}")
# Step 2: Set existing rows' update_time to their create_time
update_sql = f"""
UPDATE {table_name}
SET update_time = create_time
WHERE update_time IS NULL
"""
await self.execute(update_sql)
logger.info(
f"PostgreSQL, Initialized update_time values in {table_name}"
)
# Step 3: Set default for future rows
set_default_sql = f"""
ALTER TABLE {table_name}
ALTER COLUMN update_time SET DEFAULT CURRENT_TIMESTAMP
"""
await self.execute(set_default_sql)
logger.info(
f"PostgreSQL, Set default for update_time column in {table_name}"
)
except Exception as e:
logger.error(
f"PostgreSQL, Failed to add update_time column to {table_name}: {e}"
)
# 2. Create index on (workspace, canonical_entity) for get_aliases_for_canonical query
index_name = "idx_lightrag_entity_aliases_canonical"
check_index_sql = """
SELECT indexname
FROM pg_indexes
WHERE tablename = $1
AND indexname = $2
"""
index_exists = await self.query(
check_index_sql, [table_name.lower(), index_name]
)
if not index_exists:
try:
create_index_sql = f"""
CREATE INDEX {index_name}
ON {table_name} (workspace, canonical_entity)
"""
await self.execute(create_index_sql)
logger.info(f"PostgreSQL, Created index {index_name} on {table_name}")
except Exception as e:
logger.error(f"PostgreSQL, Failed to create index {index_name}: {e}")
# 3. Add CHECK constraint for confidence range if it doesn't exist
constraint_name = "confidence_range"
check_constraint_sql = """
SELECT constraint_name
FROM information_schema.table_constraints
WHERE table_name = $1
AND constraint_name = $2
AND constraint_type = 'CHECK'
AND table_schema = 'public'
"""
constraint_exists = await self.query(
check_constraint_sql, [table_name.lower(), constraint_name]
)
if not constraint_exists:
try:
add_constraint_sql = f"""
ALTER TABLE {table_name}
ADD CONSTRAINT {constraint_name}
CHECK (confidence >= 0 AND confidence <= 1)
"""
await self.execute(add_constraint_sql)
logger.info(
f"PostgreSQL, Added CHECK constraint {constraint_name} to {table_name}"
)
except Exception as e:
logger.warning(
f"PostgreSQL, Failed to add CHECK constraint {constraint_name} to {table_name}: {e}"
)
async def _create_pagination_indexes(self):
"""Create indexes to optimize pagination queries for LIGHTRAG_DOC_STATUS"""
indexes = [
@@ -1402,7 +1536,7 @@ class PostgreSQLDB:
"VCHORDRQ": f"""
CREATE INDEX {{vector_index_name}}
ON {{k}} USING vchordrq (content_vector vector_cosine_ops)
{f'WITH (options = $${self.vchordrq_build_options}$$)' if self.vchordrq_build_options else ''}
{f"WITH (options = $${self.vchordrq_build_options}$$)" if self.vchordrq_build_options else ""}
""",
}
@@ -4906,6 +5040,19 @@ TABLES = {
CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id)
)"""
},
"LIGHTRAG_ENTITY_ALIASES": {
"ddl": """CREATE TABLE LIGHTRAG_ENTITY_ALIASES (
workspace VARCHAR(255),
alias VARCHAR(512),
canonical_entity VARCHAR(512),
method VARCHAR(50),
confidence FLOAT,
create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT LIGHTRAG_ENTITY_ALIASES_PK PRIMARY KEY (workspace, alias),
CONSTRAINT confidence_range CHECK (confidence >= 0 AND confidence <= 1)
)"""
},
}
@@ -5117,4 +5264,25 @@ SQL_TEMPLATES = {
"drop_specifiy_table_workspace": """
DELETE FROM {table_name} WHERE workspace=$1
""",
# Entity alias cache
"get_alias": """
SELECT canonical_entity, method, confidence
FROM LIGHTRAG_ENTITY_ALIASES
WHERE workspace=$1 AND alias=$2
""",
"upsert_alias": """
INSERT INTO LIGHTRAG_ENTITY_ALIASES
(workspace, alias, canonical_entity, method, confidence, create_time, update_time)
VALUES ($1, $2, $3, $4, $5, $6, $6)
ON CONFLICT (workspace, alias) DO UPDATE SET
canonical_entity = EXCLUDED.canonical_entity,
method = EXCLUDED.method,
confidence = EXCLUDED.confidence,
update_time = CURRENT_TIMESTAMP
""",
"get_aliases_for_canonical": """
SELECT alias, method, confidence
FROM LIGHTRAG_ENTITY_ALIASES
WHERE workspace=$1 AND canonical_entity=$2
""",
}

lightrag/lightrag.py

@@ -26,6 +26,7 @@ from typing import (
)
from lightrag.prompt import PROMPTS
from lightrag.exceptions import PipelineCancelledException
from lightrag.entity_resolution import EntityResolutionConfig
from lightrag.constants import (
DEFAULT_MAX_GLEANING,
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
@@ -217,6 +218,11 @@ class LightRAG:
)
)
entity_resolution_config: EntityResolutionConfig | None = field(default=None)
"""Configuration for entity resolution (deduplication).
Set to EntityResolutionConfig() to enable, or None to disable.
Resolves entities like 'FDA' → 'US Food and Drug Administration'."""
# Text chunking
# ---

lightrag/operate.py

@@ -5,6 +5,7 @@ from pathlib import Path
import asyncio
import json
import json_repair
import re
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
@@ -50,6 +51,13 @@ from lightrag.base import (
QueryContextResult,
)
from lightrag.prompt import PROMPTS
from lightrag.entity_resolution import (
resolve_entity_with_vdb,
get_cached_alias,
store_alias,
fuzzy_similarity,
EntityResolutionConfig,
)
from lightrag.constants import (
GRAPH_FIELD_SEP,
DEFAULT_MAX_ENTITY_TOKENS,
@@ -1590,6 +1598,180 @@ async def _rebuild_single_relationship(
pipeline_status["history_messages"].append(status_message)
def _has_different_numeric_suffix(name_a: str, name_b: str) -> bool:
"""Check if two names have different numeric components.
This prevents false fuzzy matches between entities that differ only by number,
such as "Interleukin-4" vs "Interleukin-13" (88.9% similar but semantically distinct).
Scientific/medical entities often use numbers as key identifiers:
- Interleukins: IL-4, IL-13, IL-17
- Drug phases: Phase 1, Phase 2, Phase 3
- Receptor types: Type 1, Type 2
- Versions: v1.0, v2.0
Args:
name_a: First entity name
name_b: Second entity name
Returns:
True if both names contain numbers but the numbers differ, False otherwise.
"""
# Extract all numeric patterns (integers and decimals)
pattern = r"(\d+(?:\.\d+)?)"
nums_a = re.findall(pattern, name_a)
nums_b = re.findall(pattern, name_b)
# If both have numbers and they differ, these are likely distinct entities
if nums_a and nums_b and nums_a != nums_b:
return True
return False
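
A quick sanity check of the guard (assuming the helper lives in lightrag.operate, as in this diff):

from lightrag.operate import _has_different_numeric_suffix

print(_has_different_numeric_suffix("IL-4", "IL-13"))             # True  -> fuzzy match blocked
print(_has_different_numeric_suffix("Phase 2", "Phase 2 trial"))  # False -> numbers agree
print(_has_different_numeric_suffix("Dupixant", "Dupixent"))      # False -> no numbers present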
async def _build_pre_resolution_map(
entity_names: list[str],
entity_types: dict[str, str],
entity_vdb,
llm_fn,
config: EntityResolutionConfig,
) -> tuple[dict[str, str], dict[str, float]]:
"""Build resolution map before parallel processing to prevent race conditions.
This function resolves entities against each other within the batch (using
instant fuzzy matching) and against existing VDB entries. The resulting map
is applied during parallel entity processing.
Args:
entity_names: List of entity names to resolve
entity_types: Dict mapping entity names to their types (e.g., "person", "organization").
Used to prevent fuzzy matching between entities of different types.
entity_vdb: Entity vector database for checking existing entities
llm_fn: LLM function for semantic verification
config: Entity resolution configuration
Returns:
Tuple of:
- resolution_map: Dict mapping original entity names to their resolved canonical names.
Only entities that need remapping are included.
- confidence_map: Dict mapping alias to confidence score (1.0 for exact, actual
similarity for fuzzy, result.confidence for VDB matches).
"""
resolution_map: dict[str, str] = {}
confidence_map: dict[str, float] = {}
# Track canonical entities with their types: [(name, type), ...]
canonical_entities: list[tuple[str, str]] = []
for entity_name in entity_names:
normalized = entity_name.lower().strip()
entity_type = entity_types.get(entity_name, "")
# Skip if already resolved to something in this batch
if entity_name in resolution_map:
continue
# Layer 1: Case-insensitive exact match within batch
matched = False
for canonical, canonical_type in canonical_entities:
if canonical.lower().strip() == normalized:
resolution_map[entity_name] = canonical
confidence_map[entity_name] = 1.0 # Exact match = perfect confidence
logger.debug(
f"Pre-resolution (case match): '{entity_name}''{canonical}'"
)
matched = True
break
if matched:
continue
# Layer 2: Fuzzy match within batch (catches typos like Dupixant→Dupixent)
# Only enabled when config.fuzzy_pre_resolution_enabled is True.
# Requires: similarity >= threshold AND matching types (or unknown).
if config.fuzzy_pre_resolution_enabled:
for canonical, canonical_type in canonical_entities:
similarity = fuzzy_similarity(entity_name, canonical)
if similarity >= config.fuzzy_threshold:
# Type compatibility check: skip if types differ and both known.
# Empty/unknown types are treated as compatible to avoid
# blocking legitimate matches when type info is incomplete.
types_compatible = (
not entity_type
or not canonical_type
or entity_type == canonical_type
)
if not types_compatible:
logger.debug(
f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
f"'{entity_name}' ({entity_type}) → "
f"'{canonical}' ({canonical_type}) - type mismatch"
)
continue
# Numeric suffix check: skip if names have different numbers
# This prevents false matches like "Interleukin-4" → "Interleukin-13"
# where fuzzy similarity is high (88.9%) but entities are distinct
if _has_different_numeric_suffix(entity_name, canonical):
logger.debug(
f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
f"'{entity_name}''{canonical}' - different numeric suffix"
)
continue
# Accept the fuzzy match - emit warning for review
resolution_map[entity_name] = canonical
confidence_map[entity_name] = (
similarity # Use actual similarity score
)
etype_display = entity_type or "unknown"
ctype_display = canonical_type or "unknown"
logger.warning(
f"Fuzzy pre-resolution accepted: '{entity_name}'"
f"'{canonical}' (similarity={similarity:.3f}, "
f"types: {etype_display}{ctype_display}). "
f"Review for correctness; adjust fuzzy_threshold or "
f"disable fuzzy_pre_resolution_enabled if needed."
)
matched = True
break
if matched:
continue
# Layer 3: Check existing VDB for cross-document deduplication
if entity_vdb and llm_fn:
try:
result = await resolve_entity_with_vdb(
entity_name, entity_vdb, llm_fn, config
)
if result.action == "match" and result.matched_entity:
resolution_map[entity_name] = result.matched_entity
confidence_map[entity_name] = (
result.confidence
) # Use VDB result confidence
# Add canonical from VDB so batch entities can match it.
# VDB matches don't have type info available, use empty.
canonical_entities.append((result.matched_entity, ""))
logger.debug(
f"Pre-resolution (VDB {result.method}): "
f"'{entity_name}''{result.matched_entity}'"
)
continue
except Exception as e:
logger.debug(
f"Pre-resolution VDB check failed for '{entity_name}': {e}"
)
# No match found - this is a new canonical entity
canonical_entities.append((entity_name, entity_type))
if resolution_map:
logger.info(
f"Pre-resolution: {len(resolution_map)} entities mapped to canonical forms"
)
return resolution_map, confidence_map
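
To see what the helper produces, it can be called directly with the VDB and LLM layers disabled (it is a private function, so this is test-style usage only; the lightrag.operate path is assumed from this diff). Note that the first spelling seen becomes canonical, so a typo that appears first wins within the batch:

import asyncio
from lightrag.operate import _build_pre_resolution_map  # private helper above
from lightrag.entity_resolution import EntityResolutionConfig

async def demo() -> None:
    # entity_vdb=None and llm_fn=None disable Layer 3, so only
    # the in-batch exact and fuzzy layers run here.
    resolution_map, confidence_map = await _build_pre_resolution_map(
        ["FDA", "fda", "Dupixant", "Dupixent"],
        {},    # no type info -> all types treated as compatible
        None,
        None,
        EntityResolutionConfig(),
    )
    print(resolution_map)   # {'fda': 'FDA', 'Dupixent': 'Dupixant'}
    print(confidence_map)   # {'fda': 1.0, 'Dupixent': 0.875}
    # The first spelling seen becomes canonical, so the typo "Dupixant"
    # wins here purely because it preceded "Dupixent" in the batch.

asyncio.run(demo())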
async def _merge_nodes_then_upsert(
entity_name: str,
nodes_data: list[dict],
@@ -1600,8 +1782,128 @@
pipeline_status_lock=None,
llm_response_cache: BaseKVStorage | None = None,
entity_chunks_storage: BaseKVStorage | None = None,
):
"""Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert."""
pre_resolution_map: dict[str, str] | None = None,
) -> tuple[dict, str | None]:
"""Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.
Returns:
Tuple of (node_data, original_entity_name). original_entity_name is set if
entity resolution changed the name (e.g., "Dupixant" → "Dupixent"),
otherwise None.
"""
original_entity_name = entity_name # Track original before resolution
# Apply pre-resolution map immediately (prevents race conditions in parallel processing)
pre_resolved = False
if pre_resolution_map and entity_name in pre_resolution_map:
entity_name = pre_resolution_map[entity_name]
pre_resolved = True
logger.debug(
f"Applied pre-resolution: '{original_entity_name}''{entity_name}'"
)
# Entity Resolution: Resolve new entity against existing entities
# Skip if already pre-resolved (to avoid redundant VDB queries)
entity_resolution_config_raw = global_config.get("entity_resolution_config")
entity_resolution_config = None
if entity_resolution_config_raw:
# Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
entity_resolution_config = entity_resolution_config_raw
elif isinstance(entity_resolution_config_raw, dict):
try:
entity_resolution_config = EntityResolutionConfig(
**entity_resolution_config_raw
)
except TypeError as e:
logger.warning(
f"Invalid entity_resolution_config: {e}. "
f"Config: {entity_resolution_config_raw}. Skipping resolution."
)
# Safely check if entity resolution is enabled, handling both object and dict forms
def _is_resolution_enabled(config) -> bool:
if config is None:
return False
if isinstance(config, dict):
return config.get("enabled", False)
return getattr(config, "enabled", False)
# Skip VDB resolution if entity was already pre-resolved (prevents redundant queries)
if (
_is_resolution_enabled(entity_resolution_config)
and entity_vdb
and not pre_resolved
):
original_name = entity_name
workspace = global_config.get("workspace", "")
# Try knowledge_graph_inst.db first (more reliable), fallback to entity_vdb.db
db = getattr(knowledge_graph_inst, "db", None) or getattr(
entity_vdb, "db", None
)
# Layer 0: Check alias cache first (PostgreSQL-only - requires db connection)
# Note: Alias caching is only available when using PostgreSQL storage backend
if db is not None:
try:
cached = await get_cached_alias(original_name, db, workspace)
if cached:
canonical, method, _ = cached
logger.debug(
f"Alias cache hit: '{original_name}''{canonical}' "
f"(method: {method})"
)
entity_name = canonical
except Exception as e:
logger.warning(
f"Entity resolution cache lookup failed for '{original_name}' "
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
"Continuing without cache."
)
# Layers 1-3: Full VDB resolution (if not found in cache)
if entity_name == original_name:
llm_fn = global_config.get("llm_model_func")
if llm_fn:
try:
resolution = await resolve_entity_with_vdb(
entity_name,
entity_vdb,
llm_fn,
entity_resolution_config,
)
if resolution.action == "match" and resolution.matched_entity:
logger.info(
f"Entity resolution: '{entity_name}''{resolution.matched_entity}' "
f"(method: {resolution.method}, confidence: {resolution.confidence:.2f})"
)
entity_name = resolution.matched_entity
# Store in alias cache for next time (PostgreSQL-only)
# Note: Alias caching requires PostgreSQL storage backend
if db is not None:
try:
await store_alias(
original_name,
entity_name,
resolution.method,
resolution.confidence,
db,
workspace,
)
except Exception as e:
logger.warning(
f"Failed to store entity alias '{original_name}''{entity_name}' "
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
"Resolution succeeded but cache not updated."
)
except Exception as e:
logger.warning(
f"Entity resolution failed for '{original_name}' "
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
"Continuing with original entity name."
)
already_entity_types = []
already_source_ids = []
already_description = []
@@ -1865,7 +2167,12 @@
max_retries=3,
retry_delay=0.1,
)
return node_data
# Return original name if resolution changed it, None otherwise
resolved_from = (
original_entity_name if entity_name != original_entity_name else None
)
return node_data, resolved_from
async def _merge_edges_then_upsert(
@@ -1882,7 +2189,13 @@
added_entities: list = None, # New parameter to track entities added during edge processing
relation_chunks_storage: BaseKVStorage | None = None,
entity_chunks_storage: BaseKVStorage | None = None,
entity_resolution_map: dict[str, str] | None = None, # Map original→resolved names
):
# Apply entity resolution mapping to edge endpoints
if entity_resolution_map:
src_id = entity_resolution_map.get(src_id, src_id)
tgt_id = entity_resolution_map.get(tgt_id, tgt_id)
if src_id == tgt_id:
return None
@@ -2472,6 +2785,76 @@ async def merge_nodes_and_edges(
graph_max_async = global_config.get("llm_model_max_async", 4) * 2
semaphore = asyncio.Semaphore(graph_max_async)
# ===== Pre-Resolution Phase: Build entity resolution map =====
# This prevents race conditions when parallel workers process similar entities
# IMPORTANT: Include BOTH entity names AND relation endpoints to catch all duplicates
pre_resolution_map: dict[str, str] = {}
entity_resolution_config_raw = global_config.get("entity_resolution_config")
if entity_resolution_config_raw:
# Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
config = None
if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
config = entity_resolution_config_raw
elif isinstance(entity_resolution_config_raw, dict):
try:
config = EntityResolutionConfig(**entity_resolution_config_raw)
except TypeError as e:
logger.warning(
f"Invalid entity_resolution_config: {e}. "
f"Config: {entity_resolution_config_raw}. Skipping resolution."
)
if config and config.enabled:
llm_fn = global_config.get("llm_model_func")
# Build entity_types map for type-aware fuzzy matching.
# Use first non-empty type for entities with multiple occurrences.
entity_types: dict[str, str] = {}
for entity_name, entities in all_nodes.items():
for entity_data in entities:
etype = entity_data.get("entity_type", "")
if etype:
entity_types[entity_name] = etype
break
# Collect ALL entity names: from entities AND from relation endpoints
# This ensures relation endpoints like "EU Medicines Agency" get resolved
# against existing entities like "European Medicines Agency"
all_entity_names = set(all_nodes.keys())
for src_id, tgt_id in all_edges.keys():
all_entity_names.add(src_id)
all_entity_names.add(tgt_id)
pre_resolution_map, confidence_map = await _build_pre_resolution_map(
list(all_entity_names),
entity_types,
entity_vdb,
llm_fn,
config,
)
# Cache pre-resolution aliases for future lookups (PostgreSQL-only)
# This ensures aliases discovered during batch processing are available
# for subsequent document ingestion without re-running resolution
db = getattr(knowledge_graph_inst, "db", None)
if db is not None and pre_resolution_map:
workspace = global_config.get("workspace", "")
for alias, canonical in pre_resolution_map.items():
# Don't cache self-references (entity → itself)
if alias.lower().strip() != canonical.lower().strip():
try:
await store_alias(
alias=alias,
canonical=canonical,
method="pre_resolution",
confidence=confidence_map.get(alias, 1.0),
db=db,
workspace=workspace,
)
except Exception as e:
logger.debug(
f"Failed to cache pre-resolution alias "
f"'{alias}''{canonical}': {e}"
)
# ===== Phase 1: Process all entities concurrently =====
log_message = f"Phase 1: Processing {total_entities_count} entities from {doc_id} (async: {graph_max_async})"
logger.info(log_message)
@@ -2479,6 +2862,11 @@
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
# Resolution map to track original→resolved entity names (e.g., "Dupixant"→"Dupixent")
# This will be used to remap edge endpoints in Phase 2
entity_resolution_map: dict[str, str] = {}
resolution_map_lock = asyncio.Lock()
async def _locked_process_entity_name(entity_name, entities):
async with semaphore:
# Check for cancellation before processing entity
@@ -2496,7 +2884,7 @@
):
try:
logger.debug(f"Processing entity {entity_name}")
entity_data = await _merge_nodes_then_upsert(
entity_data, resolved_from = await _merge_nodes_then_upsert(
entity_name,
entities,
knowledge_graph_inst,
@@ -2506,8 +2894,15 @@
pipeline_status_lock,
llm_response_cache,
entity_chunks_storage,
pre_resolution_map,
)
# Track resolution mapping for edge remapping in Phase 2
if resolved_from is not None:
resolved_to = entity_data.get("entity_name", entity_name)
async with resolution_map_lock:
entity_resolution_map[resolved_from] = resolved_to
return entity_data
except Exception as e:
@@ -2617,6 +3012,7 @@ async def merge_nodes_and_edges(
added_entities, # Pass list to collect added entities
relation_chunks_storage,
entity_chunks_storage, # Add entity_chunks_storage parameter
entity_resolution_map, # Apply entity resolution to edge endpoints
)
if edge_data is None:
@@ -2649,9 +3045,36 @@
raise prefixed_exception from e
# Create relationship processing tasks
edge_tasks = []
# Apply pre_resolution_map to edge endpoints to prevent duplicates from relation extraction
# Key fixes: sort for lock ordering, filter self-loops, deduplicate merged edges
resolved_edges: dict[tuple[str, str], list] = {}
for edge_key, edges in all_edges.items():
task = asyncio.create_task(_locked_process_edges(edge_key, edges))
# Remap edge endpoints using pre-resolution map
# This catches cases like "EU Medicines Agency" → "European Medicines Agency"
resolved_src = pre_resolution_map.get(edge_key[0], edge_key[0])
resolved_tgt = pre_resolution_map.get(edge_key[1], edge_key[1])
# Skip self-loops created by resolution (e.g., both endpoints resolve to same entity)
if resolved_src == resolved_tgt:
logger.debug(
f"Skipping self-loop after resolution: {edge_key} → ({resolved_src}, {resolved_tgt})"
)
continue
# Sort for consistent lock ordering (prevents deadlocks)
resolved_edge_key = tuple(sorted([resolved_src, resolved_tgt]))
# Merge edges that resolve to same key (deduplication)
if resolved_edge_key not in resolved_edges:
resolved_edges[resolved_edge_key] = []
resolved_edges[resolved_edge_key].extend(edges)
# Create tasks from deduplicated edges
edge_tasks = []
for resolved_edge_key, merged_edges in resolved_edges.items():
task = asyncio.create_task(
_locked_process_edges(resolved_edge_key, merged_edges)
)
edge_tasks.append(task)
# Execute relationship tasks with error handling
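
Stripped of storage and locking concerns, the remap-and-deduplicate step above reduces to plain dictionary work. A self-contained sketch with toy data (edge payloads are placeholder strings):

pre_resolution_map = {"EU Medicines Agency": "European Medicines Agency"}
all_edges = {
    ("EU Medicines Agency", "Dupixent"): ["edge-a"],
    ("European Medicines Agency", "Dupixent"): ["edge-b"],
    ("Dupixant", "Dupixent"): ["edge-c"],
}
resolved: dict[tuple[str, str], list] = {}
for (src, tgt), edges in all_edges.items():
    src = pre_resolution_map.get(src, src)
    tgt = pre_resolution_map.get(tgt, tgt)
    if src == tgt:
        continue  # self-loop created by resolution
    key = tuple(sorted([src, tgt]))  # stable ordering prevents lock deadlocks
    resolved.setdefault(key, []).extend(edges)
print(resolved)
# {('Dupixent', 'European Medicines Agency'): ['edge-a', 'edge-b'],
#  ('Dupixant', 'Dupixent'): ['edge-c']}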

WebUI: App.tsx

@@ -2,7 +2,6 @@ import { useState, useCallback, useEffect, useRef } from 'react'
import ThemeProvider from '@/components/ThemeProvider'
import TabVisibilityProvider from '@/contexts/TabVisibilityProvider'
import ApiKeyAlert from '@/components/ApiKeyAlert'
import StatusIndicator from '@/components/status/StatusIndicator'
import { SiteInfo, webuiPrefix } from '@/lib/constants'
import { useBackendState, useAuthStore } from '@/stores/state'
import { useSettingsStore } from '@/stores/settings'
@@ -218,7 +217,6 @@ function App() {
</TabsContent>
</div>
</Tabs>
{enableHealthCheck && <StatusIndicator />}
<ApiKeyAlert open={apiKeyAlertOpen} onOpenChange={handleApiKeyAlertOpenChange} />
</main>
)}

WebUI: AppSettings.tsx

@@ -1,9 +1,7 @@
import { useState, useCallback } from 'react'
import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/Popover'
import { useCallback } from 'react'
import Button from '@/components/ui/Button'
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/Select'
import { useSettingsStore } from '@/stores/settings'
import { PaletteIcon } from 'lucide-react'
import { SunIcon, MoonIcon } from 'lucide-react'
import { useTranslation } from 'react-i18next'
import { cn } from '@/lib/utils'
@@ -12,63 +10,39 @@ interface AppSettingsProps {
}
export default function AppSettings({ className }: AppSettingsProps) {
const [opened, setOpened] = useState<boolean>(false)
const { t } = useTranslation()
const language = useSettingsStore.use.language()
const setLanguage = useSettingsStore.use.setLanguage()
const theme = useSettingsStore.use.theme()
const setTheme = useSettingsStore.use.setTheme()
const handleLanguageChange = useCallback((value: string) => {
setLanguage(value as 'en' | 'zh' | 'fr' | 'ar' | 'zh_TW')
}, [setLanguage])
// Compute effective theme for icon/tooltip display when theme is 'system'
const effectiveTheme = theme === 'system'
? (window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light')
: theme
const handleThemeChange = useCallback((value: string) => {
setTheme(value as 'light' | 'dark' | 'system')
}, [setTheme])
const handleThemeToggle = useCallback(() => {
if (theme === 'system') {
// Detect actual system preference and toggle to opposite
const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches
setTheme(isDark ? 'light' : 'dark')
} else {
setTheme(theme === 'dark' ? 'light' : 'dark')
}
}, [theme, setTheme])
return (
<Popover open={opened} onOpenChange={setOpened}>
<PopoverTrigger asChild>
<Button variant="ghost" size="icon" className={cn('h-9 w-9', className)}>
<PaletteIcon className="h-5 w-5" />
</Button>
</PopoverTrigger>
<PopoverContent side="bottom" align="end" className="w-56">
<div className="flex flex-col gap-4">
<div className="flex flex-col gap-2">
<label className="text-sm font-medium">{t('settings.language')}</label>
<Select value={language} onValueChange={handleLanguageChange}>
<SelectTrigger>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="en">English</SelectItem>
<SelectItem value="zh"></SelectItem>
<SelectItem value="fr">Français</SelectItem>
<SelectItem value="ar">العربية</SelectItem>
<SelectItem value="zh_TW"></SelectItem>
</SelectContent>
</Select>
</div>
<div className="flex flex-col gap-2">
<label className="text-sm font-medium">{t('settings.theme')}</label>
<Select value={theme} onValueChange={handleThemeChange}>
<SelectTrigger>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="light">{t('settings.light')}</SelectItem>
<SelectItem value="dark">{t('settings.dark')}</SelectItem>
<SelectItem value="system">{t('settings.system')}</SelectItem>
</SelectContent>
</Select>
</div>
</div>
</PopoverContent>
</Popover>
<Button
variant="ghost"
size="icon"
className={cn('h-9 w-9', className)}
onClick={handleThemeToggle}
tooltip={effectiveTheme === 'dark' ? t('settings.light') : t('settings.dark')}
>
{effectiveTheme === 'dark' ? (
<MoonIcon className="h-5 w-5" />
) : (
<SunIcon className="h-5 w-5" />
)}
</Button>
)
}

WebUI: StatusIndicator.tsx

@@ -4,7 +4,7 @@ import { useEffect, useState } from 'react'
import StatusDialog from './StatusDialog'
import { useTranslation } from 'react-i18next'
const StatusIndicator = () => {
const StatusIndicator = ({ className }: { className?: string }) => {
const { t } = useTranslation()
const health = useBackendState.use.health()
const lastCheckTime = useBackendState.use.lastCheckTime()
@@ -20,7 +20,7 @@
}, [lastCheckTime])
return (
<div className="fixed right-4 bottom-4 flex items-center gap-2 opacity-80 select-none">
<div className={cn("flex items-center gap-2 opacity-80 select-none", className)}>
<div
className="flex cursor-pointer items-center gap-2"
onClick={() => setDialogOpen(true)}

WebUI: SiteHeader.tsx

@@ -1,13 +1,14 @@
import Button from '@/components/ui/Button'
import { SiteInfo, webuiPrefix } from '@/lib/constants'
import { webuiPrefix } from '@/lib/constants'
import AppSettings from '@/components/AppSettings'
import StatusIndicator from '@/components/status/StatusIndicator'
import { TabsList, TabsTrigger } from '@/components/ui/Tabs'
import { useSettingsStore } from '@/stores/settings'
import { useAuthStore } from '@/stores/state'
import { cn } from '@/lib/utils'
import { useTranslation } from 'react-i18next'
import { navigationService } from '@/services/navigation'
import { ZapIcon, GithubIcon, LogOutIcon } from 'lucide-react'
import { ZapIcon, LogOutIcon } from 'lucide-react'
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/Tooltip'
interface NavigationTabProps {
@@ -56,17 +57,8 @@ function TabsNavigation() {
export default function SiteHeader() {
const { t } = useTranslation()
const { isGuestMode, coreVersion, apiVersion, username, webuiTitle, webuiDescription } = useAuthStore()
const versionDisplay = (coreVersion && apiVersion)
? `${coreVersion}/${apiVersion}`
: null;
// Check if frontend needs rebuild (apiVersion ends with warning symbol)
const hasWarning = apiVersion?.endsWith('⚠️');
const versionTooltip = hasWarning
? t('header.frontendNeedsRebuild')
: versionDisplay ? `v${versionDisplay}` : '';
const { isGuestMode, username, webuiTitle, webuiDescription } = useAuthStore()
const enableHealthCheck = useSettingsStore.use.enableHealthCheck()
const handleLogout = () => {
navigationService.navigateToLogin();
@@ -77,7 +69,6 @@ export default function SiteHeader() {
<div className="min-w-[200px] w-auto flex items-center">
<a href={webuiPrefix} className="flex items-center gap-2">
<ZapIcon className="size-4 text-emerald-400" aria-hidden="true" />
<span className="font-bold md:inline-block">{SiteInfo.name}</span>
</a>
{webuiTitle && (
<div className="flex items-center">
@@ -111,25 +102,7 @@
<nav className="w-[200px] flex items-center justify-end">
<div className="flex items-center gap-2">
{versionDisplay && (
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<span className="text-xs text-gray-500 dark:text-gray-400 mr-1 cursor-default">
v{versionDisplay}
</span>
</TooltipTrigger>
<TooltipContent side="bottom">
{versionTooltip}
</TooltipContent>
</Tooltip>
</TooltipProvider>
)}
<Button variant="ghost" size="icon" side="bottom" tooltip={t('header.projectRepository')}>
<a href={SiteInfo.github} target="_blank" rel="noopener noreferrer">
<GithubIcon className="size-4" aria-hidden="true" />
</a>
</Button>
{enableHealthCheck && <StatusIndicator />}
<AppSettings />
{!isGuestMode && (
<Button

WebUI: settings store (stores/settings.ts)

@@ -87,7 +87,7 @@ interface SettingsState {
const useSettingsStoreBase = create<SettingsState>()(
persist(
(set) => ({
theme: 'system',
theme: 'light',
language: 'en',
showPropertyPanel: true,
showNodeSearchBar: true,
@@ -238,7 +238,7 @@
{
name: 'settings-storage',
storage: createJSONStorage(() => localStorage),
version: 19,
version: 20,
migrate: (state: any, version: number) => {
if (version < 2) {
state.showEdgeLabel = false
@@ -341,6 +341,15 @@
delete state.querySettings.response_type
}
}
if (version < 20) {
// Only set defaults if values are missing (preserve user preference)
if (!state.theme) {
state.theme = 'light'
}
if (!state.language) {
state.language = 'en'
}
}
return state
}
}

tests/test_entity_resolution/__init__.py (new file)

@@ -0,0 +1 @@
# Entity resolution tests

tests/test_entity_resolution: resolver unit tests (new file)

@@ -0,0 +1,240 @@
"""
Unit tests for Entity Resolution
Tests the 3-layer approach with mock embed_fn and llm_fn.
No database or external services required.
"""
import pytest
from lightrag.entity_resolution import (
EntityResolutionConfig,
resolve_entity,
)
# Mock embeddings - pre-computed for test entities
# These simulate what an embedding model would return
MOCK_EMBEDDINGS = {
# FDA and full name have ~0.67 similarity (based on real test)
"fda": [0.1, 0.2, 0.3, 0.4, 0.5],
"us food and drug administration": [0.15, 0.25, 0.28, 0.38, 0.52],
# Dupixent and dupilumab have ~0.63 similarity
"dupixent": [0.5, 0.6, 0.7, 0.8, 0.9],
"dupilumab": [0.48, 0.58, 0.72, 0.78, 0.88],
# Celebrex and Cerebyx are different (low similarity)
"celebrex": [0.9, 0.1, 0.2, 0.3, 0.4],
"cerebyx": [0.1, 0.9, 0.8, 0.7, 0.6],
# Default for unknown entities
"default": [0.0, 0.0, 0.0, 0.0, 0.0],
}
# Mock LLM responses
MOCK_LLM_RESPONSES = {
("fda", "us food and drug administration"): "YES",
("us food and drug administration", "fda"): "YES",
("dupixent", "dupilumab"): "YES",
("dupilumab", "dupixent"): "YES",
("heart attack", "myocardial infarction"): "YES",
("celebrex", "cerebyx"): "NO",
("metformin", "metoprolol"): "NO",
}
async def mock_embed_fn(text: str) -> list[float]:
"""Mock embedding function."""
key = text.lower().strip()
return MOCK_EMBEDDINGS.get(key, MOCK_EMBEDDINGS["default"])
async def mock_llm_fn(prompt: str) -> str:
"""Mock LLM function that parses the prompt and returns YES/NO."""
# Extract term_a and term_b from the prompt
lines = prompt.strip().split("\n")
term_a = None
term_b = None
for line in lines:
if line.startswith("Term A:"):
term_a = line.replace("Term A:", "").strip().lower()
elif line.startswith("Term B:"):
term_b = line.replace("Term B:", "").strip().lower()
if term_a and term_b:
# Check both orderings
response = MOCK_LLM_RESPONSES.get((term_a, term_b))
if response is None:
response = MOCK_LLM_RESPONSES.get((term_b, term_a), "NO")
return response
return "NO"
# Test fixtures
@pytest.fixture
def existing_entities():
"""Existing entities in the knowledge graph."""
return [
(
"US Food and Drug Administration",
MOCK_EMBEDDINGS["us food and drug administration"],
),
("Dupixent", MOCK_EMBEDDINGS["dupixent"]),
("Celebrex", MOCK_EMBEDDINGS["celebrex"]),
]
@pytest.fixture
def config():
"""Default resolution config."""
return EntityResolutionConfig()
# Layer 1: Case normalization tests
class TestCaseNormalization:
@pytest.mark.asyncio
async def test_exact_match_same_case(self, existing_entities, config):
"""Exact match with same case."""
result = await resolve_entity(
"Dupixent",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "Dupixent"
assert result.method == "exact"
assert result.confidence == 1.0
@pytest.mark.asyncio
async def test_exact_match_different_case(self, existing_entities, config):
"""DUPIXENT should match Dupixent via case normalization."""
result = await resolve_entity(
"DUPIXENT",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "Dupixent"
assert result.method == "exact"
@pytest.mark.asyncio
async def test_exact_match_lowercase(self, existing_entities, config):
"""dupixent should match Dupixent."""
result = await resolve_entity(
"dupixent",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.method == "exact"
# Layer 2: Fuzzy matching tests
class TestFuzzyMatching:
@pytest.mark.asyncio
async def test_fuzzy_match_typo(self, existing_entities, config):
"""Dupixant (typo) should match Dupixent via fuzzy matching (88%)."""
result = await resolve_entity(
"Dupixant",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "Dupixent"
assert result.method == "fuzzy"
assert result.confidence >= 0.85
@pytest.mark.asyncio
async def test_fuzzy_rejects_below_threshold(self, existing_entities, config):
"""Celebrex vs Cerebyx is 67% - should NOT fuzzy match."""
# Add Cerebyx as the query (Celebrex exists)
result = await resolve_entity(
"Cerebyx",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
# Should not be fuzzy match (67% < 85%)
assert result.method != "fuzzy" or result.action == "new"
# Layer 3: LLM verification tests
class TestLLMVerification:
@pytest.mark.asyncio
async def test_llm_matches_acronym(self, existing_entities, config):
"""FDA should match US Food and Drug Administration via LLM."""
result = await resolve_entity(
"FDA",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "US Food and Drug Administration"
assert result.method == "llm"
@pytest.mark.asyncio
async def test_llm_matches_brand_generic(self, config):
"""Dupixent should match dupilumab via LLM."""
existing = [
("dupilumab", MOCK_EMBEDDINGS["dupilumab"]),
]
result = await resolve_entity(
"Dupixent",
existing,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "dupilumab"
assert result.method == "llm"
# Edge cases
class TestEdgeCases:
@pytest.mark.asyncio
async def test_empty_existing_entities(self, config):
"""New entity when no existing entities."""
result = await resolve_entity(
"NewEntity",
[],
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "new"
@pytest.mark.asyncio
async def test_disabled_resolution(self, existing_entities):
"""Resolution disabled returns new."""
config = EntityResolutionConfig(enabled=False)
result = await resolve_entity(
"Dupixent",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "new"
assert result.method == "disabled"
@pytest.mark.asyncio
async def test_genuinely_new_entity(self, existing_entities, config):
"""Completely new entity should return 'new'."""
result = await resolve_entity(
"CompletelyNewDrug",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "new"
assert result.method == "none"