# LightRAG/lightrag/entity_resolution/config.py

"""Configuration for Entity Resolution

Uses the same LLM that LightRAG is configured with - no separate model config needed.
"""

from dataclasses import dataclass, field


@dataclass
class EntityResolutionConfig:
    """Tunable settings for the entity resolution system.

    Every field has a default, so ``EntityResolutionConfig()`` yields a
    ready-to-use configuration. The per-field comments below describe what
    each knob controls and how to tune it.
    """

    # Master switch for entity resolution.
    enabled: bool = True

    # Toggle for within-batch fuzzy pre-resolution, which runs before the VDB
    # lookup and matches entities of the same batch purely by string
    # similarity. With False, the fuzzy step is skipped entirely: inside a
    # batch only exact case-insensitive matches are accepted and every other
    # resolution is left to VDB/LLM. Turning it off lowers false positives,
    # at the risk of missing obvious typo corrections.
    fuzzy_pre_resolution_enabled: bool = True

    # Cut-off for fuzzy string matching, in the range 0-1.
    #   score above the cut-off -> auto-match (e.g. the typo pair
    #                              Dupixant/Dupixent scores 0.88)
    #   score below the cut-off -> fall through to vector search
    # Tuning guide:
    #   0.90+  very conservative; near-identical strings (Dupixent/Dupixant)
    #   0.85   balanced default; catches typos, avoids most false positives
    #   0.80   aggressive; may merge distinct entities with similar names
    #   <0.75  not recommended; high false positive risk
    #          (Celebrex/Cerebyx = 0.67)
    # Validate against your own domain data; pharmaceutical names call for
    # higher thresholds.
    fuzzy_threshold: float = 0.85

    # Vector similarity cut-off used when collecting candidates. Kept low on
    # purpose to cast a wide net, since the LLM verifies each candidate;
    # 0.5 still catches FDA / US Food and Drug Administration (0.67).
    vector_threshold: float = 0.5

    # Upper bound on the number of vector candidates handed to the LLM for
    # verification. Caps cost; the LLM is the same one configured for
    # LightRAG itself.
    max_candidates: int = 3

    # Prompt template for the LLM verification step. It carries the
    # {term_a} and {term_b} placeholders and asks for a bare YES/NO answer.
    llm_prompt_template: str = field(
        default=(
            "Are these two terms referring to the same entity?\n"
            "Consider typos, misspellings, abbreviations, or alternate names.\n"
            "\n"
            "Term A: {term_a}\n"
            "Term B: {term_b}\n"
            "\n"
            "Answer only YES or NO."
        ),
    )


# Default configuration: a module-level EntityResolutionConfig built with every
# field left at its declared default.
# NOTE(review): the dataclass is not frozen, so this single instance is mutable
# and shared by everything that imports it — presumably callers treat it as
# read-only; verify against call sites.
DEFAULT_CONFIG = EntityResolutionConfig()