feat: add automatic entity resolution with 3-layer matching

Implement automatic entity resolution to prevent duplicate nodes in the
knowledge graph. The system uses a 3-layer approach:

1. Case-insensitive exact matching (free, instant)
2. Fuzzy string matching >85% threshold (free, instant)
3. Vector similarity + LLM verification (for acronyms/synonyms)

Key features:
- Pre-resolution phase prevents race conditions in parallel processing
- Numeric suffix detection blocks false matches (IL-4 ≠ IL-13)
- PostgreSQL alias cache for fast lookups on subsequent ingestion
- Configurable thresholds via environment variables

Bug fixes included:
- Fix fuzzy matching false positives for numbered entities
- Fix alias cache not being populated (missing db parameter)
- Exclude the entity_aliases table from generic id index creation

New files:
- lightrag/entity_resolution/ - Core resolution module
- tests/test_entity_resolution/ - Unit tests
- docker/postgres-age-vector/ - Custom PG image with pgvector + AGE
- docker-compose.test.yml - Integration test environment

Configuration (env.example):
- ENTITY_RESOLUTION_ENABLED=true
- ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
- ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
- ENTITY_RESOLUTION_MAX_CANDIDATES=3
clssck committed 2025-11-27 12:43:45 +01:00
parent 4f12fe121d, commit 48c7732edc
20 changed files with 1561 additions and 101 deletions

docker-compose.test.yml (new file, 84 lines)

@@ -0,0 +1,84 @@
name: lightrag-entity-resolution-test
services:
postgres:
container_name: lightrag-postgres
build:
context: ./docker/postgres-age-vector
dockerfile: Dockerfile
environment:
POSTGRES_DB: lightrag
POSTGRES_USER: lightrag
POSTGRES_PASSWORD: lightrag_pass
ports:
- "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres
volumes:
- pgdata_test:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
interval: 5s
timeout: 5s
retries: 5
lightrag:
container_name: lightrag-test
build:
context: .
dockerfile: Dockerfile
ports:
- "9622:9621" # Use 9622 to avoid conflict
volumes:
- ./data/rag_storage_test:/app/data/rag_storage
- ./data/inputs_test:/app/data/inputs
environment:
# Server
- HOST=0.0.0.0
- PORT=9621
- LOG_LEVEL=DEBUG
# LLM (OpenAI)
- LLM_BINDING=openai
- LLM_MODEL=gpt-4o-mini
- LLM_BINDING_HOST=https://api.openai.com/v1
- LLM_BINDING_API_KEY=${OPENAI_API_KEY}
# Embedding
- EMBEDDING_BINDING=openai
- EMBEDDING_MODEL=text-embedding-3-small
- EMBEDDING_DIM=1536
- EMBEDDING_BINDING_HOST=https://api.openai.com/v1
- EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}
# Storage Configuration - Full PostgreSQL!
# Custom postgres image has pgvector + Apache AGE
- LIGHTRAG_KV_STORAGE=PGKVStorage
- LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
- LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
- LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_USER=lightrag
- POSTGRES_PASSWORD=lightrag_pass
- POSTGRES_DATABASE=lightrag
# Entity Resolution - ENABLED!
- ENTITY_RESOLUTION_ENABLED=true
- ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
- ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
- ENTITY_RESOLUTION_MAX_CANDIDATES=3
# Processing
- MAX_ASYNC=4
- CHUNK_SIZE=1200
depends_on:
postgres:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
interval: 10s
timeout: 5s
retries: 10
start_period: 30s
volumes:
pgdata_test:
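
For a quick smoke test of this stack from the host, a polling sketch like the one below can hit the remapped health endpoint. The URL, port, and timings mirror the compose file above; the function name and retry loop are illustrative only.

import time
import urllib.request

# Host port 9622 maps to 9621 inside the container (see ports above).
HEALTH_URL = "http://localhost:9622/health"

def wait_for_health(url: str = HEALTH_URL, attempts: int = 10, delay: float = 5.0) -> bool:
    """Poll the LightRAG health endpoint until it responds or attempts run out."""
    for _ in range(attempts):
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # container may still be starting
        time.sleep(delay)
    return False

if __name__ == "__main__":
    print("healthy" if wait_for_health() else "not responding")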

docker/postgres-age-vector/Dockerfile (new file)

@@ -0,0 +1,26 @@
# Start from pgvector image (has vector extension pre-built correctly)
FROM pgvector/pgvector:pg17
# Install build dependencies for AGE
RUN apt-get update && apt-get install -y \
build-essential \
git \
postgresql-server-dev-17 \
libreadline-dev \
zlib1g-dev \
flex \
bison \
&& rm -rf /var/lib/apt/lists/*
# Install Apache AGE 1.6.0 for PG17
RUN cd /tmp \
&& git clone --branch release/PG17/1.6.0 https://github.com/apache/age.git \
&& cd age \
&& make \
&& make install \
&& rm -rf /tmp/age
# Add initialization script to create extensions
RUN echo "CREATE EXTENSION IF NOT EXISTS vector;" > /docker-entrypoint-initdb.d/01-vector.sql \
&& echo "CREATE EXTENSION IF NOT EXISTS age;" > /docker-entrypoint-initdb.d/02-age.sql \
&& echo "SET search_path = ag_catalog, public;" >> /docker-entrypoint-initdb.d/02-age.sql

env.example

@@ -127,6 +127,16 @@ SUMMARY_LANGUAGE=English
### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
###########################################################
### Entity Resolution Configuration
### Automatically deduplicates entities (e.g., "FDA" = "US Food and Drug Administration")
### Uses 3-layer approach: case normalization → fuzzy matching → LLM verification
###########################################################
# ENTITY_RESOLUTION_ENABLED=true
# ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
# ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
# ENTITY_RESOLUTION_MAX_CANDIDATES=3
### Chunk size for document splitting, 500~1500 is recommended
# CHUNK_SIZE=1200
# CHUNK_OVERLAP_SIZE=100
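
The four ENTITY_RESOLUTION_* variables above map one-to-one onto EntityResolutionConfig fields. A minimal sketch of that mapping, using plain os.getenv instead of the server's own get_env_value helper (shown later in this diff); config_from_env is a hypothetical name:

import os
from lightrag.entity_resolution import EntityResolutionConfig

def config_from_env() -> EntityResolutionConfig | None:
    """Build an EntityResolutionConfig from the env.example variables above."""
    if os.getenv("ENTITY_RESOLUTION_ENABLED", "false").lower() not in ("1", "true", "yes"):
        return None  # mirrors the server: disabled means no config at all
    return EntityResolutionConfig(
        enabled=True,
        fuzzy_threshold=float(os.getenv("ENTITY_RESOLUTION_FUZZY_THRESHOLD", "0.85")),
        vector_threshold=float(os.getenv("ENTITY_RESOLUTION_VECTOR_THRESHOLD", "0.5")),
        max_candidates=int(os.getenv("ENTITY_RESOLUTION_MAX_CANDIDATES", "3")),
    )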

Quick entity-resolution test script (new file)

@@ -0,0 +1,93 @@
"""
Quick test for Entity Resolution feature.
Tests that:
1. "FDA" and "US Food and Drug Administration" resolve to the same entity
2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching
"""
import asyncio
import os
import shutil
from lightrag import LightRAG
from lightrag.entity_resolution import EntityResolutionConfig
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.utils import logger
import logging
WORKING_DIR = "./test_entity_resolution"
# Test document with entities that should be deduplicated
TEST_DOC = """
The FDA approved Dupixent for treating eczema in 2017.
The US Food and Drug Administration later expanded the drug's indications.
Dupixant (sometimes misspelled) has shown good results in clinical trials.
The FDA continues to monitor the safety of Dupixent.
"""
async def main():
if not os.getenv("OPENAI_API_KEY"):
print("Error: Set OPENAI_API_KEY environment variable")
return
# Clean up previous test
if os.path.exists(WORKING_DIR):
shutil.rmtree(WORKING_DIR)
os.makedirs(WORKING_DIR)
# Set up logging to see resolution messages
logging.basicConfig(level=logging.DEBUG)
logger.setLevel(logging.DEBUG)
print("\n" + "=" * 60)
print("Entity Resolution Test")
print("=" * 60)
rag = LightRAG(
working_dir=WORKING_DIR,
embedding_func=openai_embed,
llm_model_func=gpt_4o_mini_complete,
entity_resolution_config=EntityResolutionConfig(
enabled=True,
fuzzy_threshold=0.85,
vector_threshold=0.5,
max_candidates=3,
),
)
await rag.initialize_storages()
print("\nInserting test document...")
print(f"Document: {TEST_DOC.strip()}")
print("\n" + "-" * 60)
await rag.ainsert(TEST_DOC)
print("\n" + "-" * 60)
print("Checking extracted entities...")
# Read the graph to see what entities were created
graph_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
if os.path.exists(graph_file):
import networkx as nx
G = nx.read_graphml(graph_file)
print(f"\nEntities in graph ({len(G.nodes())} total):")
for node in sorted(G.nodes()):
print(f" - {node}")
print(f"\nRelationships: {len(G.edges())}")
else:
print("Graph file not found")
await rag.finalize_storages()
print("\n" + "=" * 60)
print("Test complete!")
print("=" * 60)
if __name__ == "__main__":
asyncio.run(main())

API server configuration (parse_args)

@@ -450,6 +450,20 @@ def parse_args() -> argparse.Namespace:
"EMBEDDING_TOKEN_LIMIT", None, int, special_none=True
)
# Entity Resolution configuration
args.entity_resolution_enabled = get_env_value(
"ENTITY_RESOLUTION_ENABLED", False, bool
)
args.entity_resolution_fuzzy_threshold = get_env_value(
"ENTITY_RESOLUTION_FUZZY_THRESHOLD", 0.85, float
)
args.entity_resolution_vector_threshold = get_env_value(
"ENTITY_RESOLUTION_VECTOR_THRESHOLD", 0.5, float
)
args.entity_resolution_max_candidates = get_env_value(
"ENTITY_RESOLUTION_MAX_CANDIDATES", 3, int
)
ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag

LightRAG FastAPI server (create_app)

@@ -2,6 +2,8 @@
LightRAG FastAPI Server
"""
from __future__ import annotations
from fastapi import FastAPI, Depends, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
@@ -642,6 +644,23 @@ def create_app(args):
raise Exception(f"Failed to import {binding} options: {e}")
return {}
def create_entity_resolution_config(args) -> object | None:
"""
Create EntityResolutionConfig from command line/env arguments.
Returns None if entity resolution is disabled.
"""
if not args.entity_resolution_enabled:
return None
from lightrag.entity_resolution import EntityResolutionConfig
return EntityResolutionConfig(
enabled=True,
fuzzy_threshold=args.entity_resolution_fuzzy_threshold,
vector_threshold=args.entity_resolution_vector_threshold,
max_candidates=args.entity_resolution_max_candidates,
)
def create_optimized_embedding_function(
config_cache: LLMConfigCache, binding, model, host, api_key, args
) -> EmbeddingFunc:
@@ -1029,6 +1048,7 @@ def create_app(args):
"entity_types": args.entity_types,
},
ollama_server_infos=ollama_server_infos,
entity_resolution_config=create_entity_resolution_config(args),
)
except Exception as e:
logger.error(f"Failed to initialize LightRAG: {e}")

API models (DocStatusResponse)

@@ -443,7 +443,7 @@ class DocStatusResponse(BaseModel):
metadata: Optional[dict[str, Any]] = Field(
default=None, description="Additional metadata about the document"
)
file_path: str = Field(description="Path to the document file")
file_path: Optional[str] = Field(default=None, description="Path to the document file")
class Config:
json_schema_extra = {

lightrag/entity_resolution/__init__.py (new file)

@@ -0,0 +1,29 @@
"""
Entity Resolution Module for LightRAG
Provides automatic entity deduplication using a 3-layer approach:
1. Case normalization (exact match)
2. Fuzzy string matching (typos)
3. Vector similarity + LLM verification (semantic matches)
"""
from .resolver import (
resolve_entity,
resolve_entity_with_vdb,
ResolutionResult,
get_cached_alias,
store_alias,
fuzzy_similarity,
)
from .config import EntityResolutionConfig, DEFAULT_CONFIG
__all__ = [
"resolve_entity",
"resolve_entity_with_vdb",
"ResolutionResult",
"EntityResolutionConfig",
"DEFAULT_CONFIG",
"get_cached_alias",
"store_alias",
"fuzzy_similarity",
]

lightrag/entity_resolution/config.py (new file)

@@ -0,0 +1,57 @@
"""Configuration for Entity Resolution
Uses the same LLM that LightRAG is configured with - no separate model config needed.
"""
from dataclasses import dataclass, field
@dataclass
class EntityResolutionConfig:
"""Configuration for the entity resolution system."""
# Whether entity resolution is enabled
enabled: bool = True
# Fuzzy pre-resolution: Enable/disable within-batch fuzzy matching before
# VDB lookup. When enabled, entities in the same batch are matched by string
# similarity alone. Set to False to skip fuzzy pre-resolution entirely (only
# exact case-insensitive matches will be accepted within batch; all other
# resolution goes to VDB/LLM). Disabling reduces false positives but may
# miss obvious typo corrections.
fuzzy_pre_resolution_enabled: bool = True
# Fuzzy string matching threshold (0-1)
# Above this = auto-match (catches typos like Dupixant/Dupixent at 0.88)
# Below this = continue to vector search
# Tuning advice:
# 0.90+ = Very conservative, near-identical strings (Dupixent/Dupixant)
# 0.85 = Balanced default, catches typos, avoids most false positives
# 0.80 = Aggressive, may merge distinct entities with similar names
# <0.75 = Not recommended, high false positive risk (Celebrex/Cerebyx=0.67)
# Test with your domain data; pharmaceutical names need higher thresholds.
fuzzy_threshold: float = 0.85
# Vector similarity threshold for finding candidates
# Low threshold = cast wide net, LLM will verify
# 0.5 catches FDA/US Food and Drug Administration at 0.67
vector_threshold: float = 0.5
# Maximum number of vector candidates to verify with LLM
# Limits cost - uses same LLM as LightRAG main config
max_candidates: int = 3
# LLM verification prompt template
llm_prompt_template: str = field(
default="""Are these two terms referring to the same entity?
Consider typos, misspellings, abbreviations, or alternate names.
Term A: {term_a}
Term B: {term_b}
Answer only YES or NO.""",
)
# Default configuration
DEFAULT_CONFIG = EntityResolutionConfig()
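
The similarity figures cited in the comments above are easy to reproduce, since fuzzy matching (see resolver.py below) is just difflib's SequenceMatcher ratio over lowercased, stripped strings:

from difflib import SequenceMatcher

def ratio(a: str, b: str) -> float:
    # Mirrors fuzzy_similarity in resolver.py below.
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()

print(ratio("Dupixent", "Dupixant"))  # 0.875 -> above the 0.85 default, auto-match
print(ratio("Celebrex", "Cerebyx"))   # ~0.67 -> below threshold, falls through to vector/LLM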

lightrag/entity_resolution/resolver.py (new file)

@@ -0,0 +1,335 @@
"""Entity Resolution - 3-Layer Approach
Layer 1: Case normalization (exact match)
Layer 2: Fuzzy string matching (>85% = typos)
Layer 3: Vector similarity + LLM verification (semantic matches)
Uses the same LLM that LightRAG is configured with.
"""
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from difflib import SequenceMatcher
import numpy as np
from lightrag.utils import logger
from .config import DEFAULT_CONFIG, EntityResolutionConfig
@dataclass
class ResolutionResult:
"""Result of entity resolution attempt."""
action: str # "match" | "new"
matched_entity: str | None
confidence: float
method: str # "exact" | "fuzzy" | "llm" | "none" | "disabled"
def cosine_similarity(a: list[float], b: list[float]) -> float:
"""Calculate cosine similarity between two vectors."""
a_arr, b_arr = np.array(a), np.array(b)
norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))
def fuzzy_similarity(a: str, b: str) -> float:
"""Calculate fuzzy string similarity (0-1)."""
return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()
def find_vector_candidates(
query_embedding: list[float],
existing_entities: list[tuple[str, list[float]]],
threshold: float,
) -> list[tuple[str, float]]:
"""Find entities with vector similarity above threshold."""
candidates = []
for name, embedding in existing_entities:
sim = cosine_similarity(query_embedding, embedding)
if sim >= threshold:
candidates.append((name, sim))
# Sort by similarity descending
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates
async def llm_verify(
term_a: str,
term_b: str,
llm_fn: Callable[[str], Awaitable[str]],
prompt_template: str,
) -> bool:
"""Ask LLM if two terms refer to the same entity.
Uses strict parsing with exact token matching only. Accepted responses:
- Positive: "YES", "TRUE", "SAME", "MATCH"
- Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME"
Any other response defaults to False to avoid false positive merges.
"""
prompt = prompt_template.format(term_a=term_a, term_b=term_b)
response = await llm_fn(prompt)
# Normalize response: strip whitespace, take first line only
normalized = response.strip().split("\n")[0].strip().upper()
# Remove common trailing punctuation
normalized = normalized.rstrip(".!,")
# Only accept exact tokens (no prefix/substring matching)
if normalized in ("YES", "TRUE", "SAME", "MATCH"):
return True
if normalized in ("NO", "FALSE", "DIFFERENT", "NOT SAME"):
return False
# Default to False for ambiguous responses (safer than false positive)
return False
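
A small illustration of that strict parsing with stub LLM callables (the stub names, responses, and template are invented for the demo):

import asyncio
from lightrag.entity_resolution.resolver import llm_verify  # the function above

TEMPLATE = "Term A: {term_a}\nTerm B: {term_b}\nAnswer only YES or NO."

async def terse(prompt: str) -> str:
    return "YES."  # trailing '.' is stripped -> exact token -> True

async def chatty(prompt: str) -> str:
    return "Yes, they are the same entity."  # not an exact token -> False

print(asyncio.run(llm_verify("FDA", "US Food and Drug Administration", terse, TEMPLATE)))   # True
print(asyncio.run(llm_verify("FDA", "US Food and Drug Administration", chatty, TEMPLATE)))  # False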
async def resolve_entity(
entity_name: str,
existing_entities: list[tuple[str, list[float]]],
embed_fn: Callable[[str], Awaitable[list[float]]],
llm_fn: Callable[[str], Awaitable[str]],
config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
"""Resolve an entity against existing entities using 3-layer approach.
Args:
entity_name: The new entity name to resolve
existing_entities: List of (name, embedding) tuples for existing entities
embed_fn: Async function to get embedding for a string (same as LightRAG uses)
llm_fn: Async function to query LLM (same as LightRAG uses)
config: Resolution configuration
Returns:
ResolutionResult with action ("match" or "new"), matched entity,
confidence, and method used.
"""
if not config.enabled:
return ResolutionResult("new", None, 0.0, "disabled")
if not existing_entities:
return ResolutionResult("new", None, 0.0, "none")
normalized = entity_name.lower().strip()
# Layer 1: Case-insensitive exact match
for name, _ in existing_entities:
if name.lower().strip() == normalized:
return ResolutionResult("match", name, 1.0, "exact")
# Layer 2: Fuzzy string matching (catches typos)
best_fuzzy_match = None
best_fuzzy_score = 0.0
for name, _ in existing_entities:
similarity = fuzzy_similarity(entity_name, name)
if similarity > best_fuzzy_score:
best_fuzzy_score = similarity
best_fuzzy_match = name
if best_fuzzy_score >= config.fuzzy_threshold:
return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
# Layer 3: Vector similarity + LLM verification
embedding = await embed_fn(entity_name)
candidates = find_vector_candidates(
embedding,
existing_entities,
config.vector_threshold,
)
# Verify top candidates with LLM
for candidate_name, similarity in candidates[: config.max_candidates]:
is_same = await llm_verify(
entity_name,
candidate_name,
llm_fn,
config.llm_prompt_template,
)
if is_same:
return ResolutionResult("match", candidate_name, similarity, "llm")
# No match found - this is a new entity
return ResolutionResult("new", None, 0.0, "none")
async def resolve_entity_with_vdb(
entity_name: str,
entity_vdb, # BaseVectorStorage - imported dynamically to avoid circular imports
llm_fn: Callable[[str], Awaitable[str]],
config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
"""Resolve an entity using VDB for similarity search.
This is the production integration that uses LightRAG's vector database
directly instead of requiring pre-computed embeddings.
Args:
entity_name: The new entity name to resolve
entity_vdb: LightRAG's entity vector database (BaseVectorStorage)
llm_fn: Async function to query LLM (same as LightRAG uses)
config: Resolution configuration
Returns:
ResolutionResult with action ("match" or "new"), matched entity,
confidence, and method used.
"""
if not config.enabled:
return ResolutionResult("new", None, 0.0, "disabled")
if entity_vdb is None:
return ResolutionResult("new", None, 0.0, "none")
normalized = entity_name.lower().strip()
# Query VDB for similar entities - cast wide net, LLM will verify
# top_k is max_candidates * 3 so enough candidates survive filtering
try:
candidates = await entity_vdb.query(
entity_name, top_k=config.max_candidates * 3
)
except Exception as e:
# Log and skip resolution if VDB query fails
logger.debug(f"VDB query failed for '{entity_name}': {e}")
return ResolutionResult("new", None, 0.0, "none")
if not candidates:
return ResolutionResult("new", None, 0.0, "none")
# Layer 1: Case-insensitive exact match among candidates
for candidate in candidates:
candidate_name = candidate.get("entity_name")
if candidate_name and candidate_name.lower().strip() == normalized:
return ResolutionResult("match", candidate_name, 1.0, "exact")
# Layer 2: Fuzzy string matching (catches typos)
best_fuzzy_match = None
best_fuzzy_score = 0.0
for candidate in candidates:
candidate_name = candidate.get("entity_name")
if not candidate_name:
continue
similarity = fuzzy_similarity(entity_name, candidate_name)
if similarity > best_fuzzy_score:
best_fuzzy_score = similarity
best_fuzzy_match = candidate_name
if best_fuzzy_score >= config.fuzzy_threshold:
return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
# Layer 3: LLM verification on top candidates
verified_count = 0
for candidate in candidates:
if verified_count >= config.max_candidates:
break
candidate_name = candidate.get("entity_name")
if not candidate_name:
continue
is_same = await llm_verify(
entity_name,
candidate_name,
llm_fn,
config.llm_prompt_template,
)
verified_count += 1
if is_same:
# Use distance from VDB if available (converted to similarity)
similarity = 0.7 # Default confidence for LLM match
return ResolutionResult("match", candidate_name, similarity, "llm")
# No match found - this is a new entity
return ResolutionResult("new", None, 0.0, "none")
# --- Alias Cache Functions (PostgreSQL) ---
async def get_cached_alias(
alias: str,
db, # PostgresDB instance
workspace: str,
) -> tuple[str, str, float] | None:
"""Check if alias is already resolved in cache.
Args:
alias: The entity name to look up
db: PostgresDB instance with query method
workspace: Workspace for isolation
Returns:
Tuple of (canonical_entity, method, confidence) if found, None otherwise
"""
import logging
from lightrag.kg.postgres_impl import SQL_TEMPLATES
logger = logging.getLogger(__name__)
normalized_alias = alias.lower().strip()
sql = SQL_TEMPLATES["get_alias"]
try:
result = await db.query(sql, params=[workspace, normalized_alias])
if result:
return (
result["canonical_entity"],
result["method"],
result["confidence"],
)
except Exception as e:
logger.debug(f"Alias cache lookup error: {e}")
return None
async def store_alias(
alias: str,
canonical: str,
method: str,
confidence: float,
db, # PostgresDB instance
workspace: str,
) -> None:
"""Store a resolution in the alias cache.
Args:
alias: The variant name (e.g., "FDA")
canonical: The resolved canonical name (e.g., "US Food and Drug Administration")
method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual')
confidence: Resolution confidence (0-1)
db: PostgresDB instance with execute method
workspace: Workspace for isolation
"""
import logging
from datetime import datetime, timezone
from lightrag.kg.postgres_impl import SQL_TEMPLATES
logger = logging.getLogger(__name__)
normalized_alias = alias.lower().strip()
# Don't store self-referential aliases (e.g., "FDA" → "FDA")
if normalized_alias == canonical.lower().strip():
return
sql = SQL_TEMPLATES["upsert_alias"]
try:
await db.execute(
sql,
data={
"workspace": workspace,
"alias": normalized_alias,
"canonical_entity": canonical,
"method": method,
"confidence": confidence,
"create_time": datetime.now(timezone.utc).replace(tzinfo=None),
},
)
except Exception as e:
logger.debug(f"Alias cache store error: {e}")

lightrag/kg/postgres_impl.py

@@ -1130,7 +1130,14 @@ class PostgreSQLDB:
existing_indexes = {row["indexname"] for row in existing_indexes_result}
# Create missing indexes
# Tables that don't have an 'id' column (use different primary key structure)
tables_without_id = {"LIGHTRAG_ENTITY_ALIASES"}
for k in table_names:
# Skip tables that don't have an 'id' column
if k in tables_without_id:
continue
# Create index for id column if missing
index_name = f"idx_{k.lower()}_id"
if index_name not in existing_indexes:
@@ -1259,6 +1266,12 @@
f"PostgreSQL, Failed to create full entities/relations tables: {e}"
)
# Migrate entity aliases table to add update_time, index on canonical_entity, and confidence constraint
try:
await self._migrate_entity_aliases_schema()
except Exception as e:
logger.error(f"PostgreSQL, Failed to migrate entity aliases schema: {e}")
async def _migrate_create_full_entities_relations_tables(self):
"""Create LIGHTRAG_FULL_ENTITIES and LIGHTRAG_FULL_RELATIONS tables if they don't exist"""
tables_to_check = [
@@ -1323,6 +1336,127 @@
except Exception as e:
logger.error(f"Failed to create table {table_name}: {e}")
async def _migrate_entity_aliases_schema(self):
"""Migrate LIGHTRAG_ENTITY_ALIASES table to add update_time column, canonical index, and confidence constraint"""
table_name = "LIGHTRAG_ENTITY_ALIASES"
# Check if table exists first
check_table_sql = """
SELECT table_name
FROM information_schema.tables
WHERE table_name = $1
AND table_schema = 'public'
"""
table_exists = await self.query(check_table_sql, [table_name.lower()])
if not table_exists:
logger.debug(f"Table {table_name} does not exist yet, skipping migration")
return
# 1. Add update_time column if it doesn't exist
check_column_sql = """
SELECT column_name
FROM information_schema.columns
WHERE table_name = $1
AND column_name = 'update_time'
AND table_schema = 'public'
"""
column_exists = await self.query(check_column_sql, [table_name.lower()])
if not column_exists:
try:
# Three-step migration to add update_time column:
# 1. Add column WITHOUT default - avoids full table rewrite on large tables
# 2. Backfill existing rows with create_time values
# 3. Set DEFAULT for future inserts
# Note: There's a tiny race window between steps 1-3 where concurrent
# inserts could get NULL. This is acceptable for this migration use case.
#
# Step 1: Add column WITHOUT default (existing rows get NULL)
add_column_sql = f"""
ALTER TABLE {table_name}
ADD COLUMN update_time TIMESTAMP(0)
"""
await self.execute(add_column_sql)
logger.info(f"PostgreSQL, Added update_time column to {table_name}")
# Step 2: Set existing rows' update_time to their create_time
update_sql = f"""
UPDATE {table_name}
SET update_time = create_time
WHERE update_time IS NULL
"""
await self.execute(update_sql)
logger.info(
f"PostgreSQL, Initialized update_time values in {table_name}"
)
# Step 3: Set default for future rows
set_default_sql = f"""
ALTER TABLE {table_name}
ALTER COLUMN update_time SET DEFAULT CURRENT_TIMESTAMP
"""
await self.execute(set_default_sql)
logger.info(
f"PostgreSQL, Set default for update_time column in {table_name}"
)
except Exception as e:
logger.error(
f"PostgreSQL, Failed to add update_time column to {table_name}: {e}"
)
# 2. Create index on (workspace, canonical_entity) for get_aliases_for_canonical query
index_name = "idx_lightrag_entity_aliases_canonical"
check_index_sql = """
SELECT indexname
FROM pg_indexes
WHERE tablename = $1
AND indexname = $2
"""
index_exists = await self.query(
check_index_sql, [table_name.lower(), index_name]
)
if not index_exists:
try:
create_index_sql = f"""
CREATE INDEX {index_name}
ON {table_name} (workspace, canonical_entity)
"""
await self.execute(create_index_sql)
logger.info(f"PostgreSQL, Created index {index_name} on {table_name}")
except Exception as e:
logger.error(f"PostgreSQL, Failed to create index {index_name}: {e}")
# 3. Add CHECK constraint for confidence range if it doesn't exist
constraint_name = "confidence_range"
check_constraint_sql = """
SELECT constraint_name
FROM information_schema.table_constraints
WHERE table_name = $1
AND constraint_name = $2
AND constraint_type = 'CHECK'
AND table_schema = 'public'
"""
constraint_exists = await self.query(
check_constraint_sql, [table_name.lower(), constraint_name]
)
if not constraint_exists:
try:
add_constraint_sql = f"""
ALTER TABLE {table_name}
ADD CONSTRAINT {constraint_name}
CHECK (confidence >= 0 AND confidence <= 1)
"""
await self.execute(add_constraint_sql)
logger.info(
f"PostgreSQL, Added CHECK constraint {constraint_name} to {table_name}"
)
except Exception as e:
logger.warning(
f"PostgreSQL, Failed to add CHECK constraint {constraint_name} to {table_name}: {e}"
)
async def _create_pagination_indexes(self):
"""Create indexes to optimize pagination queries for LIGHTRAG_DOC_STATUS"""
indexes = [
@@ -1402,7 +1536,7 @@ class PostgreSQLDB:
"VCHORDRQ": f"""
CREATE INDEX {{vector_index_name}}
ON {{k}} USING vchordrq (content_vector vector_cosine_ops)
{f'WITH (options = $${self.vchordrq_build_options}$$)' if self.vchordrq_build_options else ''}
{f"WITH (options = $${self.vchordrq_build_options}$$)" if self.vchordrq_build_options else ""}
""",
}
@@ -4906,6 +5040,19 @@ TABLES = {
CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id)
)"""
},
"LIGHTRAG_ENTITY_ALIASES": {
"ddl": """CREATE TABLE LIGHTRAG_ENTITY_ALIASES (
workspace VARCHAR(255),
alias VARCHAR(512),
canonical_entity VARCHAR(512),
method VARCHAR(50),
confidence FLOAT,
create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT LIGHTRAG_ENTITY_ALIASES_PK PRIMARY KEY (workspace, alias),
CONSTRAINT confidence_range CHECK (confidence >= 0 AND confidence <= 1)
)"""
},
}
@@ -5117,4 +5264,25 @@ SQL_TEMPLATES = {
"drop_specifiy_table_workspace": """
DELETE FROM {table_name} WHERE workspace=$1
""",
# Entity alias cache
"get_alias": """
SELECT canonical_entity, method, confidence
FROM LIGHTRAG_ENTITY_ALIASES
WHERE workspace=$1 AND alias=$2
""",
"upsert_alias": """
INSERT INTO LIGHTRAG_ENTITY_ALIASES
(workspace, alias, canonical_entity, method, confidence, create_time, update_time)
VALUES ($1, $2, $3, $4, $5, $6, $6)
ON CONFLICT (workspace, alias) DO UPDATE SET
canonical_entity = EXCLUDED.canonical_entity,
method = EXCLUDED.method,
confidence = EXCLUDED.confidence,
update_time = CURRENT_TIMESTAMP
""",
"get_aliases_for_canonical": """
SELECT alias, method, confidence
FROM LIGHTRAG_ENTITY_ALIASES
WHERE workspace=$1 AND canonical_entity=$2
""",
}

lightrag/lightrag.py

@@ -26,6 +26,7 @@ from typing import (
)
from lightrag.prompt import PROMPTS
from lightrag.exceptions import PipelineCancelledException
from lightrag.entity_resolution import EntityResolutionConfig
from lightrag.constants import (
DEFAULT_MAX_GLEANING,
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
@@ -217,6 +218,11 @@ class LightRAG:
)
)
entity_resolution_config: EntityResolutionConfig | None = field(default=None)
"""Configuration for entity resolution (deduplication).
Set to EntityResolutionConfig() to enable, or None to disable.
Resolves entities like 'FDA' → 'US Food and Drug Administration'."""
# Text chunking
# ---

lightrag/operate.py

@@ -5,6 +5,7 @@ from pathlib import Path
import asyncio
import json
import json_repair
import re
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
@@ -50,6 +51,13 @@ from lightrag.base import (
QueryContextResult,
)
from lightrag.prompt import PROMPTS
from lightrag.entity_resolution import (
resolve_entity_with_vdb,
get_cached_alias,
store_alias,
fuzzy_similarity,
EntityResolutionConfig,
)
from lightrag.constants import (
GRAPH_FIELD_SEP,
DEFAULT_MAX_ENTITY_TOKENS,
@@ -1590,6 +1598,180 @@ async def _rebuild_single_relationship(
pipeline_status["history_messages"].append(status_message)
def _has_different_numeric_suffix(name_a: str, name_b: str) -> bool:
"""Check if two names have different numeric components.
This prevents false fuzzy matches between entities that differ only by number,
such as "Interleukin-4" vs "Interleukin-13" (88.9% similar but semantically distinct).
Scientific/medical entities often use numbers as key identifiers:
- Interleukins: IL-4, IL-13, IL-17
- Drug phases: Phase 1, Phase 2, Phase 3
- Receptor types: Type 1, Type 2
- Versions: v1.0, v2.0
Args:
name_a: First entity name
name_b: Second entity name
Returns:
True if both names contain numbers but the numbers differ, False otherwise.
"""
# Extract all numeric patterns (integers and decimals)
pattern = r"(\d+(?:\.\d+)?)"
nums_a = re.findall(pattern, name_a)
nums_b = re.findall(pattern, name_b)
# If both have numbers and they differ, these are likely distinct entities
if nums_a and nums_b and nums_a != nums_b:
return True
return False
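
A quick sanity check of the guard (assuming the helper lives in lightrag.operate, as in this diff):

from lightrag.operate import _has_different_numeric_suffix

print(_has_different_numeric_suffix("IL-4", "IL-13"))             # True  -> fuzzy match blocked
print(_has_different_numeric_suffix("Phase 2", "Phase 2 trial"))  # False -> numbers agree
print(_has_different_numeric_suffix("Dupixant", "Dupixent"))      # False -> no numbers present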
async def _build_pre_resolution_map(
entity_names: list[str],
entity_types: dict[str, str],
entity_vdb,
llm_fn,
config: EntityResolutionConfig,
) -> tuple[dict[str, str], dict[str, float]]:
"""Build resolution map before parallel processing to prevent race conditions.
This function resolves entities against each other within the batch (using
instant fuzzy matching) and against existing VDB entries. The resulting map
is applied during parallel entity processing.
Args:
entity_names: List of entity names to resolve
entity_types: Dict mapping entity names to their types (e.g., "person", "organization").
Used to prevent fuzzy matching between entities of different types.
entity_vdb: Entity vector database for checking existing entities
llm_fn: LLM function for semantic verification
config: Entity resolution configuration
Returns:
Tuple of:
- resolution_map: Dict mapping original entity names to their resolved canonical names.
Only entities that need remapping are included.
- confidence_map: Dict mapping alias to confidence score (1.0 for exact, actual
similarity for fuzzy, result.confidence for VDB matches).
"""
resolution_map: dict[str, str] = {}
confidence_map: dict[str, float] = {}
# Track canonical entities with their types: [(name, type), ...]
canonical_entities: list[tuple[str, str]] = []
for entity_name in entity_names:
normalized = entity_name.lower().strip()
entity_type = entity_types.get(entity_name, "")
# Skip if already resolved to something in this batch
if entity_name in resolution_map:
continue
# Layer 1: Case-insensitive exact match within batch
matched = False
for canonical, canonical_type in canonical_entities:
if canonical.lower().strip() == normalized:
resolution_map[entity_name] = canonical
confidence_map[entity_name] = 1.0 # Exact match = perfect confidence
logger.debug(
f"Pre-resolution (case match): '{entity_name}''{canonical}'"
)
matched = True
break
if matched:
continue
# Layer 2: Fuzzy match within batch (catches typos like Dupixant→Dupixent)
# Only enabled when config.fuzzy_pre_resolution_enabled is True.
# Requires: similarity >= threshold AND matching types (or unknown).
if config.fuzzy_pre_resolution_enabled:
for canonical, canonical_type in canonical_entities:
similarity = fuzzy_similarity(entity_name, canonical)
if similarity >= config.fuzzy_threshold:
# Type compatibility check: skip if types differ and both known.
# Empty/unknown types are treated as compatible to avoid
# blocking legitimate matches when type info is incomplete.
types_compatible = (
not entity_type
or not canonical_type
or entity_type == canonical_type
)
if not types_compatible:
logger.debug(
f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
f"'{entity_name}' ({entity_type}) → "
f"'{canonical}' ({canonical_type}) - type mismatch"
)
continue
# Numeric suffix check: skip if names have different numbers
# This prevents false matches like "Interleukin-4" → "Interleukin-13"
# where fuzzy similarity is high (88.9%) but entities are distinct
if _has_different_numeric_suffix(entity_name, canonical):
logger.debug(
f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
f"'{entity_name}''{canonical}' - different numeric suffix"
)
continue
# Accept the fuzzy match - emit warning for review
resolution_map[entity_name] = canonical
confidence_map[entity_name] = (
similarity # Use actual similarity score
)
etype_display = entity_type or "unknown"
ctype_display = canonical_type or "unknown"
logger.warning(
f"Fuzzy pre-resolution accepted: '{entity_name}'"
f"'{canonical}' (similarity={similarity:.3f}, "
f"types: {etype_display}{ctype_display}). "
f"Review for correctness; adjust fuzzy_threshold or "
f"disable fuzzy_pre_resolution_enabled if needed."
)
matched = True
break
if matched:
continue
# Layer 3: Check existing VDB for cross-document deduplication
if entity_vdb and llm_fn:
try:
result = await resolve_entity_with_vdb(
entity_name, entity_vdb, llm_fn, config
)
if result.action == "match" and result.matched_entity:
resolution_map[entity_name] = result.matched_entity
confidence_map[entity_name] = (
result.confidence
) # Use VDB result confidence
# Add canonical from VDB so batch entities can match it.
# VDB matches don't have type info available, use empty.
canonical_entities.append((result.matched_entity, ""))
logger.debug(
f"Pre-resolution (VDB {result.method}): "
f"'{entity_name}''{result.matched_entity}'"
)
continue
except Exception as e:
logger.debug(
f"Pre-resolution VDB check failed for '{entity_name}': {e}"
)
# No match found - this is a new canonical entity
canonical_entities.append((entity_name, entity_type))
if resolution_map:
logger.info(
f"Pre-resolution: {len(resolution_map)} entities mapped to canonical forms"
)
return resolution_map, confidence_map
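
To see what the helper produces, it can be called directly with the VDB and LLM layers disabled (it is a private function, so this is test-style usage only; the lightrag.operate path is assumed from this diff). Note that the first spelling seen becomes canonical, so a typo that appears first wins within the batch:

import asyncio
from lightrag.operate import _build_pre_resolution_map  # private helper above
from lightrag.entity_resolution import EntityResolutionConfig

async def demo() -> None:
    # entity_vdb=None and llm_fn=None disable Layer 3, so only
    # the in-batch exact and fuzzy layers run here.
    resolution_map, confidence_map = await _build_pre_resolution_map(
        ["FDA", "fda", "Dupixant", "Dupixent"],
        {},    # no type info -> all types treated as compatible
        None,
        None,
        EntityResolutionConfig(),
    )
    print(resolution_map)   # {'fda': 'FDA', 'Dupixent': 'Dupixant'}
    print(confidence_map)   # {'fda': 1.0, 'Dupixent': 0.875}
    # The first spelling seen becomes canonical, so the typo "Dupixant"
    # wins here purely because it preceded "Dupixent" in the batch.

asyncio.run(demo())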
async def _merge_nodes_then_upsert(
entity_name: str,
nodes_data: list[dict],
@@ -1600,8 +1782,128 @@
pipeline_status_lock=None,
llm_response_cache: BaseKVStorage | None = None,
entity_chunks_storage: BaseKVStorage | None = None,
):
"""Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert."""
pre_resolution_map: dict[str, str] | None = None,
) -> tuple[dict, str | None]:
"""Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.
Returns:
Tuple of (node_data, original_entity_name). original_entity_name is set if
entity resolution changed the name (e.g., "Dupixant" → "Dupixent"),
otherwise None.
"""
original_entity_name = entity_name # Track original before resolution
# Apply pre-resolution map immediately (prevents race conditions in parallel processing)
pre_resolved = False
if pre_resolution_map and entity_name in pre_resolution_map:
entity_name = pre_resolution_map[entity_name]
pre_resolved = True
logger.debug(
f"Applied pre-resolution: '{original_entity_name}''{entity_name}'"
)
# Entity Resolution: Resolve new entity against existing entities
# Skip if already pre-resolved (to avoid redundant VDB queries)
entity_resolution_config_raw = global_config.get("entity_resolution_config")
entity_resolution_config = None
if entity_resolution_config_raw:
# Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
entity_resolution_config = entity_resolution_config_raw
elif isinstance(entity_resolution_config_raw, dict):
try:
entity_resolution_config = EntityResolutionConfig(
**entity_resolution_config_raw
)
except TypeError as e:
logger.warning(
f"Invalid entity_resolution_config: {e}. "
f"Config: {entity_resolution_config_raw}. Skipping resolution."
)
# Safely check if entity resolution is enabled, handling both object and dict forms
def _is_resolution_enabled(config) -> bool:
if config is None:
return False
if isinstance(config, dict):
return config.get("enabled", False)
return getattr(config, "enabled", False)
# Skip VDB resolution if entity was already pre-resolved (prevents redundant queries)
if (
_is_resolution_enabled(entity_resolution_config)
and entity_vdb
and not pre_resolved
):
original_name = entity_name
workspace = global_config.get("workspace", "")
# Try knowledge_graph_inst.db first (more reliable), fallback to entity_vdb.db
db = getattr(knowledge_graph_inst, "db", None) or getattr(
entity_vdb, "db", None
)
# Layer 0: Check alias cache first (PostgreSQL-only - requires db connection)
# Note: Alias caching is only available when using PostgreSQL storage backend
if db is not None:
try:
cached = await get_cached_alias(original_name, db, workspace)
if cached:
canonical, method, _ = cached
logger.debug(
f"Alias cache hit: '{original_name}''{canonical}' "
f"(method: {method})"
)
entity_name = canonical
except Exception as e:
logger.warning(
f"Entity resolution cache lookup failed for '{original_name}' "
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
"Continuing without cache."
)
# Layers 1-3: Full VDB resolution (if not found in cache)
if entity_name == original_name:
llm_fn = global_config.get("llm_model_func")
if llm_fn:
try:
resolution = await resolve_entity_with_vdb(
entity_name,
entity_vdb,
llm_fn,
entity_resolution_config,
)
if resolution.action == "match" and resolution.matched_entity:
logger.info(
f"Entity resolution: '{entity_name}''{resolution.matched_entity}' "
f"(method: {resolution.method}, confidence: {resolution.confidence:.2f})"
)
entity_name = resolution.matched_entity
# Store in alias cache for next time (PostgreSQL-only)
# Note: Alias caching requires PostgreSQL storage backend
if db is not None:
try:
await store_alias(
original_name,
entity_name,
resolution.method,
resolution.confidence,
db,
workspace,
)
except Exception as e:
logger.warning(
f"Failed to store entity alias '{original_name}''{entity_name}' "
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
"Resolution succeeded but cache not updated."
)
except Exception as e:
logger.warning(
f"Entity resolution failed for '{original_name}' "
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
"Continuing with original entity name."
)
already_entity_types = []
already_source_ids = []
already_description = []
@@ -1865,7 +2167,12 @@
max_retries=3,
retry_delay=0.1,
)
return node_data
# Return original name if resolution changed it, None otherwise
resolved_from = (
original_entity_name if entity_name != original_entity_name else None
)
return node_data, resolved_from
async def _merge_edges_then_upsert(
@@ -1882,7 +2189,13 @@
added_entities: list = None, # New parameter to track entities added during edge processing
relation_chunks_storage: BaseKVStorage | None = None,
entity_chunks_storage: BaseKVStorage | None = None,
entity_resolution_map: dict[str, str] | None = None, # Map original→resolved names
):
# Apply entity resolution mapping to edge endpoints
if entity_resolution_map:
src_id = entity_resolution_map.get(src_id, src_id)
tgt_id = entity_resolution_map.get(tgt_id, tgt_id)
if src_id == tgt_id:
return None
@@ -2472,6 +2785,76 @@ async def merge_nodes_and_edges(
graph_max_async = global_config.get("llm_model_max_async", 4) * 2
semaphore = asyncio.Semaphore(graph_max_async)
# ===== Pre-Resolution Phase: Build entity resolution map =====
# This prevents race conditions when parallel workers process similar entities
# IMPORTANT: Include BOTH entity names AND relation endpoints to catch all duplicates
pre_resolution_map: dict[str, str] = {}
entity_resolution_config_raw = global_config.get("entity_resolution_config")
if entity_resolution_config_raw:
# Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
config = None
if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
config = entity_resolution_config_raw
elif isinstance(entity_resolution_config_raw, dict):
try:
config = EntityResolutionConfig(**entity_resolution_config_raw)
except TypeError as e:
logger.warning(
f"Invalid entity_resolution_config: {e}. "
f"Config: {entity_resolution_config_raw}. Skipping resolution."
)
if config and config.enabled:
llm_fn = global_config.get("llm_model_func")
# Build entity_types map for type-aware fuzzy matching.
# Use first non-empty type for entities with multiple occurrences.
entity_types: dict[str, str] = {}
for entity_name, entities in all_nodes.items():
for entity_data in entities:
etype = entity_data.get("entity_type", "")
if etype:
entity_types[entity_name] = etype
break
# Collect ALL entity names: from entities AND from relation endpoints
# This ensures relation endpoints like "EU Medicines Agency" get resolved
# against existing entities like "European Medicines Agency"
all_entity_names = set(all_nodes.keys())
for src_id, tgt_id in all_edges.keys():
all_entity_names.add(src_id)
all_entity_names.add(tgt_id)
pre_resolution_map, confidence_map = await _build_pre_resolution_map(
list(all_entity_names),
entity_types,
entity_vdb,
llm_fn,
config,
)
# Cache pre-resolution aliases for future lookups (PostgreSQL-only)
# This ensures aliases discovered during batch processing are available
# for subsequent document ingestion without re-running resolution
db = getattr(knowledge_graph_inst, "db", None)
if db is not None and pre_resolution_map:
workspace = global_config.get("workspace", "")
for alias, canonical in pre_resolution_map.items():
# Don't cache self-references (entity → itself)
if alias.lower().strip() != canonical.lower().strip():
try:
await store_alias(
alias=alias,
canonical=canonical,
method="pre_resolution",
confidence=confidence_map.get(alias, 1.0),
db=db,
workspace=workspace,
)
except Exception as e:
logger.debug(
f"Failed to cache pre-resolution alias "
f"'{alias}''{canonical}': {e}"
)
# ===== Phase 1: Process all entities concurrently =====
log_message = f"Phase 1: Processing {total_entities_count} entities from {doc_id} (async: {graph_max_async})"
logger.info(log_message)
@@ -2479,6 +2862,11 @@
pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message)
# Resolution map to track original→resolved entity names (e.g., "Dupixant"→"Dupixent")
# This will be used to remap edge endpoints in Phase 2
entity_resolution_map: dict[str, str] = {}
resolution_map_lock = asyncio.Lock()
async def _locked_process_entity_name(entity_name, entities):
async with semaphore:
# Check for cancellation before processing entity
@@ -2496,7 +2884,7 @@
):
try:
logger.debug(f"Processing entity {entity_name}")
entity_data = await _merge_nodes_then_upsert(
entity_data, resolved_from = await _merge_nodes_then_upsert(
entity_name,
entities,
knowledge_graph_inst,
@@ -2506,8 +2894,15 @@
pipeline_status_lock,
llm_response_cache,
entity_chunks_storage,
pre_resolution_map,
)
# Track resolution mapping for edge remapping in Phase 2
if resolved_from is not None:
resolved_to = entity_data.get("entity_name", entity_name)
async with resolution_map_lock:
entity_resolution_map[resolved_from] = resolved_to
return entity_data
except Exception as e:
@@ -2617,6 +3012,7 @@ async def merge_nodes_and_edges(
added_entities, # Pass list to collect added entities
relation_chunks_storage,
entity_chunks_storage, # Add entity_chunks_storage parameter
entity_resolution_map, # Apply entity resolution to edge endpoints
)
if edge_data is None:
@@ -2649,9 +3045,36 @@
raise prefixed_exception from e
# Create relationship processing tasks
edge_tasks = []
# Apply pre_resolution_map to edge endpoints to prevent duplicates from relation extraction
# Key fixes: sort for lock ordering, filter self-loops, deduplicate merged edges
resolved_edges: dict[tuple[str, str], list] = {}
for edge_key, edges in all_edges.items():
task = asyncio.create_task(_locked_process_edges(edge_key, edges))
# Remap edge endpoints using pre-resolution map
# This catches cases like "EU Medicines Agency" → "European Medicines Agency"
resolved_src = pre_resolution_map.get(edge_key[0], edge_key[0])
resolved_tgt = pre_resolution_map.get(edge_key[1], edge_key[1])
# Skip self-loops created by resolution (e.g., both endpoints resolve to same entity)
if resolved_src == resolved_tgt:
logger.debug(
f"Skipping self-loop after resolution: {edge_key} → ({resolved_src}, {resolved_tgt})"
)
continue
# Sort for consistent lock ordering (prevents deadlocks)
resolved_edge_key = tuple(sorted([resolved_src, resolved_tgt]))
# Merge edges that resolve to same key (deduplication)
if resolved_edge_key not in resolved_edges:
resolved_edges[resolved_edge_key] = []
resolved_edges[resolved_edge_key].extend(edges)
# Create tasks from deduplicated edges
edge_tasks = []
for resolved_edge_key, merged_edges in resolved_edges.items():
task = asyncio.create_task(
_locked_process_edges(resolved_edge_key, merged_edges)
)
edge_tasks.append(task)
# Execute relationship tasks with error handling
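
Stripped of storage and locking concerns, the remap-and-deduplicate step above reduces to plain dictionary work. A self-contained sketch with toy data (edge payloads are placeholder strings):

pre_resolution_map = {"EU Medicines Agency": "European Medicines Agency"}
all_edges = {
    ("EU Medicines Agency", "Dupixent"): ["edge-a"],
    ("European Medicines Agency", "Dupixent"): ["edge-b"],
    ("Dupixant", "Dupixent"): ["edge-c"],
}
resolved: dict[tuple[str, str], list] = {}
for (src, tgt), edges in all_edges.items():
    src = pre_resolution_map.get(src, src)
    tgt = pre_resolution_map.get(tgt, tgt)
    if src == tgt:
        continue  # self-loop created by resolution
    key = tuple(sorted([src, tgt]))  # stable ordering prevents lock deadlocks
    resolved.setdefault(key, []).extend(edges)
print(resolved)
# {('Dupixent', 'European Medicines Agency'): ['edge-a', 'edge-b'],
#  ('Dupixant', 'Dupixent'): ['edge-c']}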

WebUI: App.tsx

@@ -2,7 +2,6 @@ import { useState, useCallback, useEffect, useRef } from 'react'
import ThemeProvider from '@/components/ThemeProvider'
import TabVisibilityProvider from '@/contexts/TabVisibilityProvider'
import ApiKeyAlert from '@/components/ApiKeyAlert'
import StatusIndicator from '@/components/status/StatusIndicator'
import { SiteInfo, webuiPrefix } from '@/lib/constants'
import { useBackendState, useAuthStore } from '@/stores/state'
import { useSettingsStore } from '@/stores/settings'
@@ -218,7 +217,6 @@ function App() {
</TabsContent>
</div>
</Tabs>
{enableHealthCheck && <StatusIndicator />}
<ApiKeyAlert open={apiKeyAlertOpen} onOpenChange={handleApiKeyAlertOpenChange} />
</main>
)}

WebUI: AppSettings.tsx

@@ -1,9 +1,7 @@
import { useState, useCallback } from 'react'
import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/Popover'
import { useCallback } from 'react'
import Button from '@/components/ui/Button'
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/Select'
import { useSettingsStore } from '@/stores/settings'
import { PaletteIcon } from 'lucide-react'
import { SunIcon, MoonIcon } from 'lucide-react'
import { useTranslation } from 'react-i18next'
import { cn } from '@/lib/utils'
@@ -12,63 +10,39 @@ interface AppSettingsProps {
}
export default function AppSettings({ className }: AppSettingsProps) {
const [opened, setOpened] = useState<boolean>(false)
const { t } = useTranslation()
const language = useSettingsStore.use.language()
const setLanguage = useSettingsStore.use.setLanguage()
const theme = useSettingsStore.use.theme()
const setTheme = useSettingsStore.use.setTheme()
const handleLanguageChange = useCallback((value: string) => {
setLanguage(value as 'en' | 'zh' | 'fr' | 'ar' | 'zh_TW')
}, [setLanguage])
// Compute effective theme for icon/tooltip display when theme is 'system'
const effectiveTheme = theme === 'system'
? (window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light')
: theme
const handleThemeChange = useCallback((value: string) => {
setTheme(value as 'light' | 'dark' | 'system')
}, [setTheme])
const handleThemeToggle = useCallback(() => {
if (theme === 'system') {
// Detect actual system preference and toggle to opposite
const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches
setTheme(isDark ? 'light' : 'dark')
} else {
setTheme(theme === 'dark' ? 'light' : 'dark')
}
}, [theme, setTheme])
return (
<Popover open={opened} onOpenChange={setOpened}>
<PopoverTrigger asChild>
<Button variant="ghost" size="icon" className={cn('h-9 w-9', className)}>
<PaletteIcon className="h-5 w-5" />
</Button>
</PopoverTrigger>
<PopoverContent side="bottom" align="end" className="w-56">
<div className="flex flex-col gap-4">
<div className="flex flex-col gap-2">
<label className="text-sm font-medium">{t('settings.language')}</label>
<Select value={language} onValueChange={handleLanguageChange}>
<SelectTrigger>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="en">English</SelectItem>
<SelectItem value="zh"></SelectItem>
<SelectItem value="fr">Français</SelectItem>
<SelectItem value="ar">العربية</SelectItem>
<SelectItem value="zh_TW"></SelectItem>
</SelectContent>
</Select>
</div>
<div className="flex flex-col gap-2">
<label className="text-sm font-medium">{t('settings.theme')}</label>
<Select value={theme} onValueChange={handleThemeChange}>
<SelectTrigger>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="light">{t('settings.light')}</SelectItem>
<SelectItem value="dark">{t('settings.dark')}</SelectItem>
<SelectItem value="system">{t('settings.system')}</SelectItem>
</SelectContent>
</Select>
</div>
</div>
</PopoverContent>
</Popover>
<Button
variant="ghost"
size="icon"
className={cn('h-9 w-9', className)}
onClick={handleThemeToggle}
tooltip={effectiveTheme === 'dark' ? t('settings.light') : t('settings.dark')}
>
{effectiveTheme === 'dark' ? (
<MoonIcon className="h-5 w-5" />
) : (
<SunIcon className="h-5 w-5" />
)}
</Button>
)
}

WebUI: StatusIndicator.tsx

@@ -4,7 +4,7 @@ import { useEffect, useState } from 'react'
import StatusDialog from './StatusDialog'
import { useTranslation } from 'react-i18next'
const StatusIndicator = () => {
const StatusIndicator = ({ className }: { className?: string }) => {
const { t } = useTranslation()
const health = useBackendState.use.health()
const lastCheckTime = useBackendState.use.lastCheckTime()
@@ -20,7 +20,7 @@
}, [lastCheckTime])
return (
<div className="fixed right-4 bottom-4 flex items-center gap-2 opacity-80 select-none">
<div className={cn("flex items-center gap-2 opacity-80 select-none", className)}>
<div
className="flex cursor-pointer items-center gap-2"
onClick={() => setDialogOpen(true)}

WebUI: SiteHeader.tsx

@@ -1,13 +1,14 @@
import Button from '@/components/ui/Button'
import { SiteInfo, webuiPrefix } from '@/lib/constants'
import { webuiPrefix } from '@/lib/constants'
import AppSettings from '@/components/AppSettings'
import StatusIndicator from '@/components/status/StatusIndicator'
import { TabsList, TabsTrigger } from '@/components/ui/Tabs'
import { useSettingsStore } from '@/stores/settings'
import { useAuthStore } from '@/stores/state'
import { cn } from '@/lib/utils'
import { useTranslation } from 'react-i18next'
import { navigationService } from '@/services/navigation'
import { ZapIcon, GithubIcon, LogOutIcon } from 'lucide-react'
import { ZapIcon, LogOutIcon } from 'lucide-react'
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/Tooltip'
interface NavigationTabProps {
@@ -56,17 +57,8 @@ function TabsNavigation() {
export default function SiteHeader() {
const { t } = useTranslation()
const { isGuestMode, coreVersion, apiVersion, username, webuiTitle, webuiDescription } = useAuthStore()
const versionDisplay = (coreVersion && apiVersion)
? `${coreVersion}/${apiVersion}`
: null;
// Check if frontend needs rebuild (apiVersion ends with warning symbol)
const hasWarning = apiVersion?.endsWith('⚠️');
const versionTooltip = hasWarning
? t('header.frontendNeedsRebuild')
: versionDisplay ? `v${versionDisplay}` : '';
const { isGuestMode, username, webuiTitle, webuiDescription } = useAuthStore()
const enableHealthCheck = useSettingsStore.use.enableHealthCheck()
const handleLogout = () => {
navigationService.navigateToLogin();
@@ -77,7 +69,6 @@ export default function SiteHeader() {
<div className="min-w-[200px] w-auto flex items-center">
<a href={webuiPrefix} className="flex items-center gap-2">
<ZapIcon className="size-4 text-emerald-400" aria-hidden="true" />
<span className="font-bold md:inline-block">{SiteInfo.name}</span>
</a>
{webuiTitle && (
<div className="flex items-center">
@@ -111,25 +102,7 @@
<nav className="w-[200px] flex items-center justify-end">
<div className="flex items-center gap-2">
{versionDisplay && (
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<span className="text-xs text-gray-500 dark:text-gray-400 mr-1 cursor-default">
v{versionDisplay}
</span>
</TooltipTrigger>
<TooltipContent side="bottom">
{versionTooltip}
</TooltipContent>
</Tooltip>
</TooltipProvider>
)}
<Button variant="ghost" size="icon" side="bottom" tooltip={t('header.projectRepository')}>
<a href={SiteInfo.github} target="_blank" rel="noopener noreferrer">
<GithubIcon className="size-4" aria-hidden="true" />
</a>
</Button>
{enableHealthCheck && <StatusIndicator />}
<AppSettings />
{!isGuestMode && (
<Button

WebUI: settings store (stores/settings.ts)

@@ -87,7 +87,7 @@ interface SettingsState {
const useSettingsStoreBase = create<SettingsState>()(
persist(
(set) => ({
theme: 'system',
theme: 'light',
language: 'en',
showPropertyPanel: true,
showNodeSearchBar: true,
@@ -238,7 +238,7 @@
{
name: 'settings-storage',
storage: createJSONStorage(() => localStorage),
version: 19,
version: 20,
migrate: (state: any, version: number) => {
if (version < 2) {
state.showEdgeLabel = false
@@ -341,6 +341,15 @@
delete state.querySettings.response_type
}
}
if (version < 20) {
// Only set defaults if values are missing (preserve user preference)
if (!state.theme) {
state.theme = 'light'
}
if (!state.language) {
state.language = 'en'
}
}
return state
}
}

tests/test_entity_resolution/__init__.py (new file)

@@ -0,0 +1 @@
# Entity resolution tests

tests/test_entity_resolution: resolver unit tests (new file)

@@ -0,0 +1,240 @@
"""
Unit tests for Entity Resolution
Tests the 3-layer approach with mock embed_fn and llm_fn.
No database or external services required.
"""
import pytest
from lightrag.entity_resolution import (
EntityResolutionConfig,
resolve_entity,
)
# Mock embeddings - pre-computed for test entities
# These simulate what an embedding model would return
MOCK_EMBEDDINGS = {
# FDA and full name have ~0.67 similarity (based on real test)
"fda": [0.1, 0.2, 0.3, 0.4, 0.5],
"us food and drug administration": [0.15, 0.25, 0.28, 0.38, 0.52],
# Dupixent and dupilumab have ~0.63 similarity
"dupixent": [0.5, 0.6, 0.7, 0.8, 0.9],
"dupilumab": [0.48, 0.58, 0.72, 0.78, 0.88],
# Celebrex and Cerebyx are different (low similarity)
"celebrex": [0.9, 0.1, 0.2, 0.3, 0.4],
"cerebyx": [0.1, 0.9, 0.8, 0.7, 0.6],
# Default for unknown entities
"default": [0.0, 0.0, 0.0, 0.0, 0.0],
}
# Mock LLM responses
MOCK_LLM_RESPONSES = {
("fda", "us food and drug administration"): "YES",
("us food and drug administration", "fda"): "YES",
("dupixent", "dupilumab"): "YES",
("dupilumab", "dupixent"): "YES",
("heart attack", "myocardial infarction"): "YES",
("celebrex", "cerebyx"): "NO",
("metformin", "metoprolol"): "NO",
}
async def mock_embed_fn(text: str) -> list[float]:
"""Mock embedding function."""
key = text.lower().strip()
return MOCK_EMBEDDINGS.get(key, MOCK_EMBEDDINGS["default"])
async def mock_llm_fn(prompt: str) -> str:
"""Mock LLM function that parses the prompt and returns YES/NO."""
# Extract term_a and term_b from the prompt
lines = prompt.strip().split("\n")
term_a = None
term_b = None
for line in lines:
if line.startswith("Term A:"):
term_a = line.replace("Term A:", "").strip().lower()
elif line.startswith("Term B:"):
term_b = line.replace("Term B:", "").strip().lower()
if term_a and term_b:
# Check both orderings
response = MOCK_LLM_RESPONSES.get((term_a, term_b))
if response is None:
response = MOCK_LLM_RESPONSES.get((term_b, term_a), "NO")
return response
return "NO"
# Test fixtures
@pytest.fixture
def existing_entities():
"""Existing entities in the knowledge graph."""
return [
(
"US Food and Drug Administration",
MOCK_EMBEDDINGS["us food and drug administration"],
),
("Dupixent", MOCK_EMBEDDINGS["dupixent"]),
("Celebrex", MOCK_EMBEDDINGS["celebrex"]),
]
@pytest.fixture
def config():
"""Default resolution config."""
return EntityResolutionConfig()
# Layer 1: Case normalization tests
class TestCaseNormalization:
@pytest.mark.asyncio
async def test_exact_match_same_case(self, existing_entities, config):
"""Exact match with same case."""
result = await resolve_entity(
"Dupixent",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "Dupixent"
assert result.method == "exact"
assert result.confidence == 1.0
@pytest.mark.asyncio
async def test_exact_match_different_case(self, existing_entities, config):
"""DUPIXENT should match Dupixent via case normalization."""
result = await resolve_entity(
"DUPIXENT",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "Dupixent"
assert result.method == "exact"
@pytest.mark.asyncio
async def test_exact_match_lowercase(self, existing_entities, config):
"""dupixent should match Dupixent."""
result = await resolve_entity(
"dupixent",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.method == "exact"
# Layer 2: Fuzzy matching tests
class TestFuzzyMatching:
@pytest.mark.asyncio
async def test_fuzzy_match_typo(self, existing_entities, config):
"""Dupixant (typo) should match Dupixent via fuzzy matching (88%)."""
result = await resolve_entity(
"Dupixant",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "Dupixent"
assert result.method == "fuzzy"
assert result.confidence >= 0.85
@pytest.mark.asyncio
async def test_fuzzy_rejects_below_threshold(self, existing_entities, config):
"""Celebrex vs Cerebyx is 67% - should NOT fuzzy match."""
# Add Cerebyx as the query (Celebrex exists)
result = await resolve_entity(
"Cerebyx",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
# Should not be fuzzy match (67% < 85%)
assert result.method != "fuzzy" or result.action == "new"
# Layer 3: LLM verification tests
class TestLLMVerification:
@pytest.mark.asyncio
async def test_llm_matches_acronym(self, existing_entities, config):
"""FDA should match US Food and Drug Administration via LLM."""
result = await resolve_entity(
"FDA",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "US Food and Drug Administration"
assert result.method == "llm"
@pytest.mark.asyncio
async def test_llm_matches_brand_generic(self, config):
"""Dupixent should match dupilumab via LLM."""
existing = [
("dupilumab", MOCK_EMBEDDINGS["dupilumab"]),
]
result = await resolve_entity(
"Dupixent",
existing,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "match"
assert result.matched_entity == "dupilumab"
assert result.method == "llm"
# Edge cases
class TestEdgeCases:
@pytest.mark.asyncio
async def test_empty_existing_entities(self, config):
"""New entity when no existing entities."""
result = await resolve_entity(
"NewEntity",
[],
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "new"
@pytest.mark.asyncio
async def test_disabled_resolution(self, existing_entities):
"""Resolution disabled returns new."""
config = EntityResolutionConfig(enabled=False)
result = await resolve_entity(
"Dupixent",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "new"
assert result.method == "disabled"
@pytest.mark.asyncio
async def test_genuinely_new_entity(self, existing_entities, config):
"""Completely new entity should return 'new'."""
result = await resolve_entity(
"CompletelyNewDrug",
existing_entities,
mock_embed_fn,
mock_llm_fn,
config,
)
assert result.action == "new"
assert result.method == "none"