feat: add automatic entity resolution with 3-layer matching
Implement automatic entity resolution to prevent duplicate nodes in the knowledge graph.

The system uses a 3-layer approach:
1. Case-insensitive exact matching (free, instant)
2. Fuzzy string matching >85% threshold (free, instant)
3. Vector similarity + LLM verification (for acronyms/synonyms)

Key features:
- Pre-resolution phase prevents race conditions in parallel processing
- Numeric suffix detection blocks false matches (IL-4 ≠ IL-13)
- PostgreSQL alias cache for fast lookups on subsequent ingestion
- Configurable thresholds via environment variables

Bug fixes included:
- Fix fuzzy matching false positives for numbered entities
- Fix alias cache not being populated (missing db parameter)
- Skip entity_aliases table from generic id index creation

New files:
- lightrag/entity_resolution/ - Core resolution module
- tests/test_entity_resolution/ - Unit tests
- docker/postgres-age-vector/ - Custom PG image with pgvector + AGE
- docker-compose.test.yml - Integration test environment

Configuration (env.example):
- ENTITY_RESOLUTION_ENABLED=true
- ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
- ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
- ENTITY_RESOLUTION_MAX_CANDIDATES=3
parent 4f12fe121d
commit 48c7732edc

20 changed files with 1561 additions and 101 deletions
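Before the per-file diffs, here is a minimal sketch of the 3-layer cascade described above. The function names and signatures are illustrative stand-ins, not LightRAG's actual API (the real implementation is in `lightrag/entity_resolution/resolver.py` below):

```python
# Hedged sketch of the cascade; `nearest_candidates` and `llm_yes_no` are
# hypothetical async helpers standing in for the VDB lookup and LLM check.
from difflib import SequenceMatcher


async def resolve(name, existing, nearest_candidates, llm_yes_no, fuzzy_threshold=0.85):
    normalized = name.lower().strip()
    # Layer 1: case-insensitive exact match (free, instant)
    for cand in existing:
        if cand.lower().strip() == normalized:
            return cand
    # Layer 2: fuzzy string matching above a threshold (free, instant; catches typos)
    for cand in existing:
        if SequenceMatcher(None, normalized, cand.lower().strip()).ratio() >= fuzzy_threshold:
            return cand
    # Layer 3: shortlist by vector similarity, then ask the LLM to verify
    for cand in await nearest_candidates(name):
        if await llm_yes_no(name, cand):
            return cand
    return None  # genuinely new entity
```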
84  docker-compose.test.yml  (Normal file)
@@ -0,0 +1,84 @@
name: lightrag-entity-resolution-test

services:
  postgres:
    container_name: lightrag-postgres
    build:
      context: ./docker/postgres-age-vector
      dockerfile: Dockerfile
    environment:
      POSTGRES_DB: lightrag
      POSTGRES_USER: lightrag
      POSTGRES_PASSWORD: lightrag_pass
    ports:
      - "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres
    volumes:
      - pgdata_test:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
      interval: 5s
      timeout: 5s
      retries: 5

  lightrag:
    container_name: lightrag-test
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "9622:9621" # Use 9622 to avoid conflict
    volumes:
      - ./data/rag_storage_test:/app/data/rag_storage
      - ./data/inputs_test:/app/data/inputs
    environment:
      # Server
      - HOST=0.0.0.0
      - PORT=9621
      - LOG_LEVEL=DEBUG

      # LLM (OpenAI)
      - LLM_BINDING=openai
      - LLM_MODEL=gpt-4o-mini
      - LLM_BINDING_HOST=https://api.openai.com/v1
      - LLM_BINDING_API_KEY=${OPENAI_API_KEY}

      # Embedding
      - EMBEDDING_BINDING=openai
      - EMBEDDING_MODEL=text-embedding-3-small
      - EMBEDDING_DIM=1536
      - EMBEDDING_BINDING_HOST=https://api.openai.com/v1
      - EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}

      # Storage Configuration - Full PostgreSQL!
      # Custom postgres image has pgvector + Apache AGE
      - LIGHTRAG_KV_STORAGE=PGKVStorage
      - LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
      - LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
      - LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
      - POSTGRES_HOST=postgres
      - POSTGRES_PORT=5432
      - POSTGRES_USER=lightrag
      - POSTGRES_PASSWORD=lightrag_pass
      - POSTGRES_DATABASE=lightrag

      # Entity Resolution - ENABLED!
      - ENTITY_RESOLUTION_ENABLED=true
      - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
      - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
      - ENTITY_RESOLUTION_MAX_CANDIDATES=3

      # Processing
      - MAX_ASYNC=4
      - CHUNK_SIZE=1200
    depends_on:
      postgres:
        condition: service_healthy
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 30s

volumes:
  pgdata_test:
26  docker/postgres-age-vector/Dockerfile  (Normal file)
@@ -0,0 +1,26 @@
# Start from pgvector image (has vector extension pre-built correctly)
FROM pgvector/pgvector:pg17

# Install build dependencies for AGE
RUN apt-get update && apt-get install -y \
    build-essential \
    git \
    postgresql-server-dev-17 \
    libreadline-dev \
    zlib1g-dev \
    flex \
    bison \
    && rm -rf /var/lib/apt/lists/*

# Install Apache AGE 1.6.0 for PG17
RUN cd /tmp \
    && git clone --branch release/PG17/1.6.0 https://github.com/apache/age.git \
    && cd age \
    && make \
    && make install \
    && rm -rf /tmp/age

# Add initialization script to create extensions
RUN echo "CREATE EXTENSION IF NOT EXISTS vector;" > /docker-entrypoint-initdb.d/01-vector.sql \
    && echo "CREATE EXTENSION IF NOT EXISTS age;" > /docker-entrypoint-initdb.d/02-age.sql \
    && echo "SET search_path = ag_catalog, public;" >> /docker-entrypoint-initdb.d/02-age.sql
10  env.example
@@ -127,6 +127,16 @@ SUMMARY_LANGUAGE=English
 ### Entity types that the LLM will attempt to recognize
 # ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
 
+###########################################################
+### Entity Resolution Configuration
+### Automatically deduplicates entities (e.g., "FDA" = "US Food and Drug Administration")
+### Uses 3-layer approach: case normalization → fuzzy matching → LLM verification
+###########################################################
+# ENTITY_RESOLUTION_ENABLED=true
+# ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
+# ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
+# ENTITY_RESOLUTION_MAX_CANDIDATES=3
+
 ### Chunk size for document splitting, 500~1500 is recommended
 # CHUNK_SIZE=1200
 # CHUNK_OVERLAP_SIZE=100
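These four knobs are read by the API server's `parse_args` (see that hunk further down). For library use outside the server, a minimal sketch of building the config from the same variables by hand (only the env names documented above are assumed):

```python
# Hedged sketch: mirrors the server's env handling for direct library use.
import os

from lightrag.entity_resolution import EntityResolutionConfig

config = None
if os.getenv("ENTITY_RESOLUTION_ENABLED", "false").lower() == "true":
    config = EntityResolutionConfig(
        enabled=True,
        fuzzy_threshold=float(os.getenv("ENTITY_RESOLUTION_FUZZY_THRESHOLD", "0.85")),
        vector_threshold=float(os.getenv("ENTITY_RESOLUTION_VECTOR_THRESHOLD", "0.5")),
        max_candidates=int(os.getenv("ENTITY_RESOLUTION_MAX_CANDIDATES", "3")),
    )
```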
93  examples/test_entity_resolution.py  (Normal file)
@@ -0,0 +1,93 @@
"""
Quick test for Entity Resolution feature.

Tests that:
1. "FDA" and "US Food and Drug Administration" resolve to the same entity
2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching
"""

import asyncio
import os
import shutil

from lightrag import LightRAG
from lightrag.entity_resolution import EntityResolutionConfig
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.utils import logger
import logging

WORKING_DIR = "./test_entity_resolution"

# Test document with entities that should be deduplicated
TEST_DOC = """
The FDA approved Dupixent for treating eczema in 2017.
The US Food and Drug Administration later expanded the drug's indications.
Dupixant (sometimes misspelled) has shown good results in clinical trials.
The FDA continues to monitor the safety of Dupixent.
"""


async def main():
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: Set OPENAI_API_KEY environment variable")
        return

    # Clean up previous test
    if os.path.exists(WORKING_DIR):
        shutil.rmtree(WORKING_DIR)
    os.makedirs(WORKING_DIR)

    # Set up logging to see resolution messages
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    print("\n" + "=" * 60)
    print("Entity Resolution Test")
    print("=" * 60)

    rag = LightRAG(
        working_dir=WORKING_DIR,
        embedding_func=openai_embed,
        llm_model_func=gpt_4o_mini_complete,
        entity_resolution_config=EntityResolutionConfig(
            enabled=True,
            fuzzy_threshold=0.85,
            vector_threshold=0.5,
            max_candidates=3,
        ),
    )

    await rag.initialize_storages()

    print("\nInserting test document...")
    print(f"Document: {TEST_DOC.strip()}")
    print("\n" + "-" * 60)

    await rag.ainsert(TEST_DOC)

    print("\n" + "-" * 60)
    print("Checking extracted entities...")

    # Read the graph to see what entities were created
    graph_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
    if os.path.exists(graph_file):
        import networkx as nx

        G = nx.read_graphml(graph_file)
        print(f"\nEntities in graph ({len(G.nodes())} total):")
        for node in sorted(G.nodes()):
            print(f"  - {node}")

        print(f"\nRelationships: {len(G.edges())}")
    else:
        print("Graph file not found")

    await rag.finalize_storages()

    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())
@@ -450,6 +450,20 @@ def parse_args() -> argparse.Namespace:
         "EMBEDDING_TOKEN_LIMIT", None, int, special_none=True
     )
 
+    # Entity Resolution configuration
+    args.entity_resolution_enabled = get_env_value(
+        "ENTITY_RESOLUTION_ENABLED", False, bool
+    )
+    args.entity_resolution_fuzzy_threshold = get_env_value(
+        "ENTITY_RESOLUTION_FUZZY_THRESHOLD", 0.85, float
+    )
+    args.entity_resolution_vector_threshold = get_env_value(
+        "ENTITY_RESOLUTION_VECTOR_THRESHOLD", 0.5, float
+    )
+    args.entity_resolution_max_candidates = get_env_value(
+        "ENTITY_RESOLUTION_MAX_CANDIDATES", 3, int
+    )
+
     ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
     ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag
@@ -2,6 +2,8 @@
 LightRAG FastAPI Server
 """
 
+from __future__ import annotations
+
 from fastapi import FastAPI, Depends, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
@@ -642,6 +644,23 @@ def create_app(args):
             raise Exception(f"Failed to import {binding} options: {e}")
         return {}
 
+    def create_entity_resolution_config(args) -> object | None:
+        """
+        Create EntityResolutionConfig from command line/env arguments.
+        Returns None if entity resolution is disabled.
+        """
+        if not args.entity_resolution_enabled:
+            return None
+
+        from lightrag.entity_resolution import EntityResolutionConfig
+
+        return EntityResolutionConfig(
+            enabled=True,
+            fuzzy_threshold=args.entity_resolution_fuzzy_threshold,
+            vector_threshold=args.entity_resolution_vector_threshold,
+            max_candidates=args.entity_resolution_max_candidates,
+        )
+
     def create_optimized_embedding_function(
         config_cache: LLMConfigCache, binding, model, host, api_key, args
     ) -> EmbeddingFunc:
@@ -1029,6 +1048,7 @@ def create_app(args):
                 "entity_types": args.entity_types,
             },
             ollama_server_infos=ollama_server_infos,
+            entity_resolution_config=create_entity_resolution_config(args),
         )
     except Exception as e:
         logger.error(f"Failed to initialize LightRAG: {e}")
@@ -443,7 +443,7 @@ class DocStatusResponse(BaseModel):
     metadata: Optional[dict[str, Any]] = Field(
         default=None, description="Additional metadata about the document"
     )
-    file_path: str = Field(description="Path to the document file")
+    file_path: Optional[str] = Field(default=None, description="Path to the document file")
 
     class Config:
         json_schema_extra = {
29  lightrag/entity_resolution/__init__.py  (Normal file)
@@ -0,0 +1,29 @@
"""
Entity Resolution Module for LightRAG

Provides automatic entity deduplication using a 3-layer approach:
1. Case normalization (exact match)
2. Fuzzy string matching (typos)
3. Vector similarity + LLM verification (semantic matches)
"""

from .resolver import (
    resolve_entity,
    resolve_entity_with_vdb,
    ResolutionResult,
    get_cached_alias,
    store_alias,
    fuzzy_similarity,
)
from .config import EntityResolutionConfig, DEFAULT_CONFIG

__all__ = [
    "resolve_entity",
    "resolve_entity_with_vdb",
    "ResolutionResult",
    "EntityResolutionConfig",
    "DEFAULT_CONFIG",
    "get_cached_alias",
    "store_alias",
    "fuzzy_similarity",
]
57  lightrag/entity_resolution/config.py  (Normal file)
@@ -0,0 +1,57 @@
"""Configuration for Entity Resolution

Uses the same LLM that LightRAG is configured with - no separate model config needed.
"""

from dataclasses import dataclass, field


@dataclass
class EntityResolutionConfig:
    """Configuration for the entity resolution system."""

    # Whether entity resolution is enabled
    enabled: bool = True

    # Fuzzy pre-resolution: Enable/disable within-batch fuzzy matching before
    # VDB lookup. When enabled, entities in the same batch are matched by string
    # similarity alone. Set to False to skip fuzzy pre-resolution entirely (only
    # exact case-insensitive matches will be accepted within batch; all other
    # resolution goes to VDB/LLM). Disabling reduces false positives but may
    # miss obvious typo corrections.
    fuzzy_pre_resolution_enabled: bool = True

    # Fuzzy string matching threshold (0-1)
    # Above this = auto-match (catches typos like Dupixant/Dupixent at 0.88)
    # Below this = continue to vector search
    # Tuning advice:
    #   0.90+ = Very conservative, near-identical strings (Dupixent/Dupixant)
    #   0.85  = Balanced default, catches typos, avoids most false positives
    #   0.80  = Aggressive, may merge distinct entities with similar names
    #   <0.75 = Not recommended, high false positive risk (Celebrex/Cerebyx=0.67)
    # Test with your domain data; pharmaceutical names need higher thresholds.
    fuzzy_threshold: float = 0.85

    # Vector similarity threshold for finding candidates
    # Low threshold = cast wide net, LLM will verify
    # 0.5 catches FDA/US Food and Drug Administration at 0.67
    vector_threshold: float = 0.5

    # Maximum number of vector candidates to verify with LLM
    # Limits cost - uses same LLM as LightRAG main config
    max_candidates: int = 3

    # LLM verification prompt template
    llm_prompt_template: str = field(
        default="""Are these two terms referring to the same entity?
Consider typos, misspellings, abbreviations, or alternate names.

Term A: {term_a}
Term B: {term_b}

Answer only YES or NO.""",
    )


# Default configuration
DEFAULT_CONFIG = EntityResolutionConfig()
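The threshold comments above are easy to sanity-check: `fuzzy_similarity` (defined in resolver.py below) is just a `difflib.SequenceMatcher` ratio over lowercased strings, and the drug-name pairs cited in the comments come out close to the documented values:

```python
# Probe the exact ratio that fuzzy_threshold is compared against.
from difflib import SequenceMatcher


def fuzzy_similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()


print(fuzzy_similarity("Dupixent", "Dupixant"))  # ~0.88 -> auto-merged at the 0.85 default
print(fuzzy_similarity("Celebrex", "Cerebyx"))   # ~0.67 -> correctly left unmerged
print(fuzzy_similarity("Interleukin-4", "Interleukin-13"))  # ~0.89 -> why the numeric-suffix guard exists
```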
335  lightrag/entity_resolution/resolver.py  (Normal file)
@@ -0,0 +1,335 @@
"""Entity Resolution - 3-Layer Approach

Layer 1: Case normalization (exact match)
Layer 2: Fuzzy string matching (>85% = typos)
Layer 3: Vector similarity + LLM verification (semantic matches)

Uses the same LLM that LightRAG is configured with.
"""

from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from difflib import SequenceMatcher

import numpy as np

from lightrag.utils import logger
from .config import DEFAULT_CONFIG, EntityResolutionConfig


@dataclass
class ResolutionResult:
    """Result of entity resolution attempt."""

    action: str  # "match" | "new"
    matched_entity: str | None
    confidence: float
    method: str  # "exact" | "fuzzy" | "llm" | "none" | "disabled"


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    a_arr, b_arr = np.array(a), np.array(b)
    norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))


def fuzzy_similarity(a: str, b: str) -> float:
    """Calculate fuzzy string similarity (0-1)."""
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()


def find_vector_candidates(
    query_embedding: list[float],
    existing_entities: list[tuple[str, list[float]]],
    threshold: float,
) -> list[tuple[str, float]]:
    """Find entities with vector similarity above threshold."""
    candidates = []
    for name, embedding in existing_entities:
        sim = cosine_similarity(query_embedding, embedding)
        if sim >= threshold:
            candidates.append((name, sim))
    # Sort by similarity descending
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates


async def llm_verify(
    term_a: str,
    term_b: str,
    llm_fn: Callable[[str], Awaitable[str]],
    prompt_template: str,
) -> bool:
    """Ask LLM if two terms refer to the same entity.

    Uses strict parsing with exact token matching only. Accepted responses:
    - Positive: "YES", "TRUE", "SAME", "MATCH"
    - Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME"
    Any other response defaults to False to avoid false positive merges.
    """
    prompt = prompt_template.format(term_a=term_a, term_b=term_b)
    response = await llm_fn(prompt)

    # Normalize response: strip whitespace, take first line only
    normalized = response.strip().split("\n")[0].strip().upper()

    # Remove common trailing punctuation
    normalized = normalized.rstrip(".!,")

    # Only accept exact tokens (no prefix/substring matching)
    if normalized in ("YES", "TRUE", "SAME", "MATCH"):
        return True
    if normalized in ("NO", "FALSE", "DIFFERENT", "NOT SAME"):
        return False

    # Default to False for ambiguous responses (safer than false positive)
    return False


async def resolve_entity(
    entity_name: str,
    existing_entities: list[tuple[str, list[float]]],
    embed_fn: Callable[[str], Awaitable[list[float]]],
    llm_fn: Callable[[str], Awaitable[str]],
    config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
    """Resolve an entity against existing entities using 3-layer approach.

    Args:
        entity_name: The new entity name to resolve
        existing_entities: List of (name, embedding) tuples for existing entities
        embed_fn: Async function to get embedding for a string (same as LightRAG uses)
        llm_fn: Async function to query LLM (same as LightRAG uses)
        config: Resolution configuration

    Returns:
        ResolutionResult with action ("match" or "new"), matched entity,
        confidence, and method used.
    """
    if not config.enabled:
        return ResolutionResult("new", None, 0.0, "disabled")

    if not existing_entities:
        return ResolutionResult("new", None, 0.0, "none")

    normalized = entity_name.lower().strip()

    # Layer 1: Case-insensitive exact match
    for name, _ in existing_entities:
        if name.lower().strip() == normalized:
            return ResolutionResult("match", name, 1.0, "exact")

    # Layer 2: Fuzzy string matching (catches typos)
    best_fuzzy_match = None
    best_fuzzy_score = 0.0

    for name, _ in existing_entities:
        similarity = fuzzy_similarity(entity_name, name)
        if similarity > best_fuzzy_score:
            best_fuzzy_score = similarity
            best_fuzzy_match = name

    if best_fuzzy_score >= config.fuzzy_threshold:
        return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")

    # Layer 3: Vector similarity + LLM verification
    embedding = await embed_fn(entity_name)
    candidates = find_vector_candidates(
        embedding,
        existing_entities,
        config.vector_threshold,
    )

    # Verify top candidates with LLM
    for candidate_name, similarity in candidates[: config.max_candidates]:
        is_same = await llm_verify(
            entity_name,
            candidate_name,
            llm_fn,
            config.llm_prompt_template,
        )
        if is_same:
            return ResolutionResult("match", candidate_name, similarity, "llm")

    # No match found - this is a new entity
    return ResolutionResult("new", None, 0.0, "none")


async def resolve_entity_with_vdb(
    entity_name: str,
    entity_vdb,  # BaseVectorStorage - imported dynamically to avoid circular imports
    llm_fn: Callable[[str], Awaitable[str]],
    config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
    """Resolve an entity using VDB for similarity search.

    This is the production integration that uses LightRAG's vector database
    directly instead of requiring pre-computed embeddings.

    Args:
        entity_name: The new entity name to resolve
        entity_vdb: LightRAG's entity vector database (BaseVectorStorage)
        llm_fn: Async function to query LLM (same as LightRAG uses)
        config: Resolution configuration

    Returns:
        ResolutionResult with action ("match" or "new"), matched entity,
        confidence, and method used.
    """
    if not config.enabled:
        return ResolutionResult("new", None, 0.0, "disabled")

    if entity_vdb is None:
        return ResolutionResult("new", None, 0.0, "none")

    normalized = entity_name.lower().strip()

    # Query VDB for similar entities - cast wide net, LLM will verify
    # top_k is tripled to have enough candidates after filtering
    try:
        candidates = await entity_vdb.query(
            entity_name, top_k=config.max_candidates * 3
        )
    except Exception as e:
        # Log and skip resolution if VDB query fails
        logger.debug(f"VDB query failed for '{entity_name}': {e}")
        return ResolutionResult("new", None, 0.0, "none")

    if not candidates:
        return ResolutionResult("new", None, 0.0, "none")

    # Layer 1: Case-insensitive exact match among candidates
    for candidate in candidates:
        candidate_name = candidate.get("entity_name")
        if candidate_name and candidate_name.lower().strip() == normalized:
            return ResolutionResult("match", candidate_name, 1.0, "exact")

    # Layer 2: Fuzzy string matching (catches typos)
    best_fuzzy_match = None
    best_fuzzy_score = 0.0

    for candidate in candidates:
        candidate_name = candidate.get("entity_name")
        if not candidate_name:
            continue
        similarity = fuzzy_similarity(entity_name, candidate_name)
        if similarity > best_fuzzy_score:
            best_fuzzy_score = similarity
            best_fuzzy_match = candidate_name

    if best_fuzzy_score >= config.fuzzy_threshold:
        return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")

    # Layer 3: LLM verification on top candidates
    verified_count = 0
    for candidate in candidates:
        if verified_count >= config.max_candidates:
            break
        candidate_name = candidate.get("entity_name")
        if not candidate_name:
            continue

        is_same = await llm_verify(
            entity_name,
            candidate_name,
            llm_fn,
            config.llm_prompt_template,
        )
        verified_count += 1

        if is_same:
            # Use distance from VDB if available (converted to similarity)
            similarity = 0.7  # Default confidence for LLM match
            return ResolutionResult("match", candidate_name, similarity, "llm")

    # No match found - this is a new entity
    return ResolutionResult("new", None, 0.0, "none")


# --- Alias Cache Functions (PostgreSQL) ---


async def get_cached_alias(
    alias: str,
    db,  # PostgresDB instance
    workspace: str,
) -> tuple[str, str, float] | None:
    """Check if alias is already resolved in cache.

    Args:
        alias: The entity name to look up
        db: PostgresDB instance with query method
        workspace: Workspace for isolation

    Returns:
        Tuple of (canonical_entity, method, confidence) if found, None otherwise
    """
    import logging

    from lightrag.kg.postgres_impl import SQL_TEMPLATES

    logger = logging.getLogger(__name__)
    normalized_alias = alias.lower().strip()

    sql = SQL_TEMPLATES["get_alias"]
    try:
        result = await db.query(sql, params=[workspace, normalized_alias])
        if result:
            return (
                result["canonical_entity"],
                result["method"],
                result["confidence"],
            )
    except Exception as e:
        logger.debug(f"Alias cache lookup error: {e}")
    return None


async def store_alias(
    alias: str,
    canonical: str,
    method: str,
    confidence: float,
    db,  # PostgresDB instance
    workspace: str,
) -> None:
    """Store a resolution in the alias cache.

    Args:
        alias: The variant name (e.g., "FDA")
        canonical: The resolved canonical name (e.g., "US Food and Drug Administration")
        method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual')
        confidence: Resolution confidence (0-1)
        db: PostgresDB instance with execute method
        workspace: Workspace for isolation
    """
    import logging
    from datetime import datetime, timezone

    from lightrag.kg.postgres_impl import SQL_TEMPLATES

    logger = logging.getLogger(__name__)
    normalized_alias = alias.lower().strip()

    # Don't store self-referential aliases (e.g., "FDA" → "FDA")
    if normalized_alias == canonical.lower().strip():
        return

    sql = SQL_TEMPLATES["upsert_alias"]
    try:
        await db.execute(
            sql,
            data={
                "workspace": workspace,
                "alias": normalized_alias,
                "canonical_entity": canonical,
                "method": method,
                "confidence": confidence,
                "create_time": datetime.now(timezone.utc).replace(tzinfo=None),
            },
        )
    except Exception as e:
        logger.debug(f"Alias cache store error: {e}")
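Since resolver.py is new, a short self-contained sketch of how `resolve_entity` cascades may help. The embedding and LLM callables below are hypothetical stubs, not LightRAG's real functions:

```python
# Hypothetical stubs standing in for LightRAG's real embed/LLM callables.
import asyncio

from lightrag.entity_resolution import EntityResolutionConfig, resolve_entity


async def fake_embed(text: str) -> list[float]:
    return [1.0, 0.0]  # constant embedding; pushes every name past Layer 3's shortlist


async def fake_llm(prompt: str) -> str:
    return "YES"  # llm_verify's strict parser accepts only exact YES/TRUE/SAME/MATCH tokens


async def demo():
    existing = [("Dupixent", [1.0, 0.0]), ("Aspirin", [0.0, 1.0])]
    cfg = EntityResolutionConfig(enabled=True, fuzzy_threshold=0.85)

    r = await resolve_entity("dupixent", existing, fake_embed, fake_llm, cfg)
    print(r.method, r.matched_entity)  # exact Dupixent (Layer 1: case-insensitive)

    r = await resolve_entity("Dupixant", existing, fake_embed, fake_llm, cfg)
    print(r.method, r.matched_entity)  # fuzzy Dupixent (~0.88 >= 0.85, Layer 2)

    r = await resolve_entity("US Food and Drug Administration", existing, fake_embed, fake_llm, cfg)
    print(r.method, r.matched_entity)  # llm Dupixent (Layer 3; the stub LLM approves the top candidate)


asyncio.run(demo())
```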
@@ -1130,7 +1130,14 @@ class PostgreSQLDB:
         existing_indexes = {row["indexname"] for row in existing_indexes_result}
 
         # Create missing indexes
+        # Tables that don't have an 'id' column (use different primary key structure)
+        tables_without_id = {"LIGHTRAG_ENTITY_ALIASES"}
+
         for k in table_names:
+            # Skip tables that don't have an 'id' column
+            if k in tables_without_id:
+                continue
+
             # Create index for id column if missing
             index_name = f"idx_{k.lower()}_id"
             if index_name not in existing_indexes:
@@ -1259,6 +1266,12 @@ class PostgreSQLDB:
                 f"PostgreSQL, Failed to create full entities/relations tables: {e}"
             )
 
+        # Migrate entity aliases table to add update_time, index on canonical_entity, and confidence constraint
+        try:
+            await self._migrate_entity_aliases_schema()
+        except Exception as e:
+            logger.error(f"PostgreSQL, Failed to migrate entity aliases schema: {e}")
+
     async def _migrate_create_full_entities_relations_tables(self):
         """Create LIGHTRAG_FULL_ENTITIES and LIGHTRAG_FULL_RELATIONS tables if they don't exist"""
         tables_to_check = [
@@ -1323,6 +1336,127 @@ class PostgreSQLDB:
         except Exception as e:
             logger.error(f"Failed to create table {table_name}: {e}")
 
+    async def _migrate_entity_aliases_schema(self):
+        """Migrate LIGHTRAG_ENTITY_ALIASES table to add update_time column, canonical index, and confidence constraint"""
+        table_name = "LIGHTRAG_ENTITY_ALIASES"
+
+        # Check if table exists first
+        check_table_sql = """
+            SELECT table_name
+            FROM information_schema.tables
+            WHERE table_name = $1
+              AND table_schema = 'public'
+        """
+        table_exists = await self.query(check_table_sql, [table_name.lower()])
+        if not table_exists:
+            logger.debug(f"Table {table_name} does not exist yet, skipping migration")
+            return
+
+        # 1. Add update_time column if it doesn't exist
+        check_column_sql = """
+            SELECT column_name
+            FROM information_schema.columns
+            WHERE table_name = $1
+              AND column_name = 'update_time'
+              AND table_schema = 'public'
+        """
+        column_exists = await self.query(check_column_sql, [table_name.lower()])
+
+        if not column_exists:
+            try:
+                # Three-step migration to add update_time column:
+                # 1. Add column WITHOUT default - avoids full table rewrite on large tables
+                # 2. Backfill existing rows with create_time values
+                # 3. Set DEFAULT for future inserts
+                # Note: There's a tiny race window between steps 1-3 where concurrent
+                # inserts could get NULL. This is acceptable for this migration use case.
+                #
+                # Step 1: Add column WITHOUT default (existing rows get NULL)
+                add_column_sql = f"""
+                    ALTER TABLE {table_name}
+                    ADD COLUMN update_time TIMESTAMP(0)
+                """
+                await self.execute(add_column_sql)
+                logger.info(f"PostgreSQL, Added update_time column to {table_name}")
+
+                # Step 2: Set existing rows' update_time to their create_time
+                update_sql = f"""
+                    UPDATE {table_name}
+                    SET update_time = create_time
+                    WHERE update_time IS NULL
+                """
+                await self.execute(update_sql)
+                logger.info(
+                    f"PostgreSQL, Initialized update_time values in {table_name}"
+                )
+
+                # Step 3: Set default for future rows
+                set_default_sql = f"""
+                    ALTER TABLE {table_name}
+                    ALTER COLUMN update_time SET DEFAULT CURRENT_TIMESTAMP
+                """
+                await self.execute(set_default_sql)
+                logger.info(
+                    f"PostgreSQL, Set default for update_time column in {table_name}"
+                )
+            except Exception as e:
+                logger.error(
+                    f"PostgreSQL, Failed to add update_time column to {table_name}: {e}"
+                )
+
+        # 2. Create index on (workspace, canonical_entity) for get_aliases_for_canonical query
+        index_name = "idx_lightrag_entity_aliases_canonical"
+        check_index_sql = """
+            SELECT indexname
+            FROM pg_indexes
+            WHERE tablename = $1
+              AND indexname = $2
+        """
+        index_exists = await self.query(
+            check_index_sql, [table_name.lower(), index_name]
+        )
+
+        if not index_exists:
+            try:
+                create_index_sql = f"""
+                    CREATE INDEX {index_name}
+                    ON {table_name} (workspace, canonical_entity)
+                """
+                await self.execute(create_index_sql)
+                logger.info(f"PostgreSQL, Created index {index_name} on {table_name}")
+            except Exception as e:
+                logger.error(f"PostgreSQL, Failed to create index {index_name}: {e}")
+
+        # 3. Add CHECK constraint for confidence range if it doesn't exist
+        constraint_name = "confidence_range"
+        check_constraint_sql = """
+            SELECT constraint_name
+            FROM information_schema.table_constraints
+            WHERE table_name = $1
+              AND constraint_name = $2
+              AND constraint_type = 'CHECK'
+              AND table_schema = 'public'
+        """
+        constraint_exists = await self.query(
+            check_constraint_sql, [table_name.lower(), constraint_name]
+        )
+
+        if not constraint_exists:
+            try:
+                add_constraint_sql = f"""
+                    ALTER TABLE {table_name}
+                    ADD CONSTRAINT {constraint_name}
+                    CHECK (confidence >= 0 AND confidence <= 1)
+                """
+                await self.execute(add_constraint_sql)
+                logger.info(
+                    f"PostgreSQL, Added CHECK constraint {constraint_name} to {table_name}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"PostgreSQL, Failed to add CHECK constraint {constraint_name} to {table_name}: {e}"
+                )
+
     async def _create_pagination_indexes(self):
         """Create indexes to optimize pagination queries for LIGHTRAG_DOC_STATUS"""
         indexes = [
@@ -1402,7 +1536,7 @@ class PostgreSQLDB:
             "VCHORDRQ": f"""
                 CREATE INDEX {{vector_index_name}}
                 ON {{k}} USING vchordrq (content_vector vector_cosine_ops)
-                {f'WITH (options = $${self.vchordrq_build_options}$$)' if self.vchordrq_build_options else ''}
+                {f"WITH (options = $${self.vchordrq_build_options}$$)" if self.vchordrq_build_options else ""}
             """,
         }
@@ -4906,6 +5040,19 @@ TABLES = {
             CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id)
         )"""
     },
+    "LIGHTRAG_ENTITY_ALIASES": {
+        "ddl": """CREATE TABLE LIGHTRAG_ENTITY_ALIASES (
+            workspace VARCHAR(255),
+            alias VARCHAR(512),
+            canonical_entity VARCHAR(512),
+            method VARCHAR(50),
+            confidence FLOAT,
+            create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
+            update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
+            CONSTRAINT LIGHTRAG_ENTITY_ALIASES_PK PRIMARY KEY (workspace, alias),
+            CONSTRAINT confidence_range CHECK (confidence >= 0 AND confidence <= 1)
+        )"""
+    },
 }
@@ -5117,4 +5264,25 @@ SQL_TEMPLATES = {
     "drop_specifiy_table_workspace": """
        DELETE FROM {table_name} WHERE workspace=$1
        """,
+    # Entity alias cache
+    "get_alias": """
+        SELECT canonical_entity, method, confidence
+        FROM LIGHTRAG_ENTITY_ALIASES
+        WHERE workspace=$1 AND alias=$2
+        """,
+    "upsert_alias": """
+        INSERT INTO LIGHTRAG_ENTITY_ALIASES
+        (workspace, alias, canonical_entity, method, confidence, create_time, update_time)
+        VALUES ($1, $2, $3, $4, $5, $6, $6)
+        ON CONFLICT (workspace, alias) DO UPDATE SET
+            canonical_entity = EXCLUDED.canonical_entity,
+            method = EXCLUDED.method,
+            confidence = EXCLUDED.confidence,
+            update_time = CURRENT_TIMESTAMP
+        """,
+    "get_aliases_for_canonical": """
+        SELECT alias, method, confidence
+        FROM LIGHTRAG_ENTITY_ALIASES
+        WHERE workspace=$1 AND canonical_entity=$2
+        """,
 }
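To make the upsert semantics concrete: because the primary key is (workspace, alias), re-resolving an alias updates the cached row in place rather than inserting a duplicate. A hedged sketch (assumes a reachable PostgreSQL with the table above; the DSN matches the test compose file but is a placeholder):

```python
# Sketch only: assumes LIGHTRAG_ENTITY_ALIASES exists and Postgres is reachable.
import asyncio
from datetime import datetime, timezone

import asyncpg

UPSERT = """
INSERT INTO LIGHTRAG_ENTITY_ALIASES
(workspace, alias, canonical_entity, method, confidence, create_time, update_time)
VALUES ($1, $2, $3, $4, $5, $6, $6)
ON CONFLICT (workspace, alias) DO UPDATE SET
    canonical_entity = EXCLUDED.canonical_entity,
    method = EXCLUDED.method,
    confidence = EXCLUDED.confidence,
    update_time = CURRENT_TIMESTAMP
"""


async def demo():
    conn = await asyncpg.connect("postgresql://lightrag:lightrag_pass@localhost:5433/lightrag")
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    # First resolution of "fda": row is inserted
    await conn.execute(UPSERT, "default", "fda", "US Food and Drug Administration", "llm", 0.7, now)
    # Second resolution of the same alias: the same row is updated, update_time bumped
    await conn.execute(UPSERT, "default", "fda", "US Food and Drug Administration", "llm", 0.9, now)
    row = await conn.fetchrow(
        "SELECT canonical_entity, confidence FROM LIGHTRAG_ENTITY_ALIASES WHERE workspace=$1 AND alias=$2",
        "default", "fda",
    )
    print(dict(row))  # confidence reflects the latest resolution (0.9); still a single row
    await conn.close()


asyncio.run(demo())
```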
@@ -26,6 +26,7 @@ from typing import (
 )
 from lightrag.prompt import PROMPTS
 from lightrag.exceptions import PipelineCancelledException
+from lightrag.entity_resolution import EntityResolutionConfig
 from lightrag.constants import (
     DEFAULT_MAX_GLEANING,
     DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
@@ -217,6 +218,11 @@ class LightRAG:
         )
     )
 
+    entity_resolution_config: EntityResolutionConfig | None = field(default=None)
+    """Configuration for entity resolution (deduplication).
+    Set to EntityResolutionConfig() to enable, or None to disable.
+    Resolves entities like 'FDA' → 'US Food and Drug Administration'."""
+
     # Text chunking
     # ---
@@ -5,6 +5,7 @@ from pathlib import Path
import asyncio
import json
import json_repair
import re
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
@@ -50,6 +51,13 @@ from lightrag.base import (
     QueryContextResult,
 )
 from lightrag.prompt import PROMPTS
+from lightrag.entity_resolution import (
+    resolve_entity_with_vdb,
+    get_cached_alias,
+    store_alias,
+    fuzzy_similarity,
+    EntityResolutionConfig,
+)
 from lightrag.constants import (
     GRAPH_FIELD_SEP,
     DEFAULT_MAX_ENTITY_TOKENS,
@@ -1590,6 +1598,180 @@ async def _rebuild_single_relationship(
         pipeline_status["history_messages"].append(status_message)
 
 
+def _has_different_numeric_suffix(name_a: str, name_b: str) -> bool:
+    """Check if two names have different numeric components.
+
+    This prevents false fuzzy matches between entities that differ only by number,
+    such as "Interleukin-4" vs "Interleukin-13" (88.9% similar but semantically distinct).
+
+    Scientific/medical entities often use numbers as key identifiers:
+    - Interleukins: IL-4, IL-13, IL-17
+    - Drug phases: Phase 1, Phase 2, Phase 3
+    - Receptor types: Type 1, Type 2
+    - Versions: v1.0, v2.0
+
+    Args:
+        name_a: First entity name
+        name_b: Second entity name
+
+    Returns:
+        True if both names contain numbers but the numbers differ, False otherwise.
+    """
+    # Extract all numeric patterns (integers and decimals)
+    pattern = r"(\d+(?:\.\d+)?)"
+    nums_a = re.findall(pattern, name_a)
+    nums_b = re.findall(pattern, name_b)
+
+    # If both have numbers and they differ, these are likely distinct entities
+    if nums_a and nums_b and nums_a != nums_b:
+        return True
+    return False
+
+
+async def _build_pre_resolution_map(
+    entity_names: list[str],
+    entity_types: dict[str, str],
+    entity_vdb,
+    llm_fn,
+    config: EntityResolutionConfig,
+) -> tuple[dict[str, str], dict[str, float]]:
+    """Build resolution map before parallel processing to prevent race conditions.
+
+    This function resolves entities against each other within the batch (using
+    instant fuzzy matching) and against existing VDB entries. The resulting map
+    is applied during parallel entity processing.
+
+    Args:
+        entity_names: List of entity names to resolve
+        entity_types: Dict mapping entity names to their types (e.g., "person", "organization").
+            Used to prevent fuzzy matching between entities of different types.
+        entity_vdb: Entity vector database for checking existing entities
+        llm_fn: LLM function for semantic verification
+        config: Entity resolution configuration
+
+    Returns:
+        Tuple of:
+        - resolution_map: Dict mapping original entity names to their resolved canonical names.
+          Only entities that need remapping are included.
+        - confidence_map: Dict mapping alias to confidence score (1.0 for exact, actual
+          similarity for fuzzy, result.confidence for VDB matches).
+    """
+    resolution_map: dict[str, str] = {}
+    confidence_map: dict[str, float] = {}
+    # Track canonical entities with their types: [(name, type), ...]
+    canonical_entities: list[tuple[str, str]] = []
+
+    for entity_name in entity_names:
+        normalized = entity_name.lower().strip()
+        entity_type = entity_types.get(entity_name, "")
+
+        # Skip if already resolved to something in this batch
+        if entity_name in resolution_map:
+            continue
+
+        # Layer 1: Case-insensitive exact match within batch
+        matched = False
+        for canonical, canonical_type in canonical_entities:
+            if canonical.lower().strip() == normalized:
+                resolution_map[entity_name] = canonical
+                confidence_map[entity_name] = 1.0  # Exact match = perfect confidence
+                logger.debug(
+                    f"Pre-resolution (case match): '{entity_name}' → '{canonical}'"
+                )
+                matched = True
+                break
+
+        if matched:
+            continue
+
+        # Layer 2: Fuzzy match within batch (catches typos like Dupixant→Dupixent)
+        # Only enabled when config.fuzzy_pre_resolution_enabled is True.
+        # Requires: similarity >= threshold AND matching types (or unknown).
+        if config.fuzzy_pre_resolution_enabled:
+            for canonical, canonical_type in canonical_entities:
+                similarity = fuzzy_similarity(entity_name, canonical)
+                if similarity >= config.fuzzy_threshold:
+                    # Type compatibility check: skip if types differ and both known.
+                    # Empty/unknown types are treated as compatible to avoid
+                    # blocking legitimate matches when type info is incomplete.
+                    types_compatible = (
+                        not entity_type
+                        or not canonical_type
+                        or entity_type == canonical_type
+                    )
+                    if not types_compatible:
+                        logger.debug(
+                            f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
+                            f"'{entity_name}' ({entity_type}) → "
+                            f"'{canonical}' ({canonical_type}) - type mismatch"
+                        )
+                        continue
+
+                    # Numeric suffix check: skip if names have different numbers
+                    # This prevents false matches like "Interleukin-4" → "Interleukin-13"
+                    # where fuzzy similarity is high (88.9%) but entities are distinct
+                    if _has_different_numeric_suffix(entity_name, canonical):
+                        logger.debug(
+                            f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
+                            f"'{entity_name}' → '{canonical}' - different numeric suffix"
+                        )
+                        continue
+
+                    # Accept the fuzzy match - emit warning for review
+                    resolution_map[entity_name] = canonical
+                    confidence_map[entity_name] = (
+                        similarity  # Use actual similarity score
+                    )
+                    etype_display = entity_type or "unknown"
+                    ctype_display = canonical_type or "unknown"
+                    logger.warning(
+                        f"Fuzzy pre-resolution accepted: '{entity_name}' → "
+                        f"'{canonical}' (similarity={similarity:.3f}, "
+                        f"types: {etype_display}→{ctype_display}). "
+                        f"Review for correctness; adjust fuzzy_threshold or "
+                        f"disable fuzzy_pre_resolution_enabled if needed."
+                    )
+                    matched = True
+                    break
+
+        if matched:
+            continue
+
+        # Layer 3: Check existing VDB for cross-document deduplication
+        if entity_vdb and llm_fn:
+            try:
+                result = await resolve_entity_with_vdb(
+                    entity_name, entity_vdb, llm_fn, config
+                )
+                if result.action == "match" and result.matched_entity:
+                    resolution_map[entity_name] = result.matched_entity
+                    confidence_map[entity_name] = (
+                        result.confidence
+                    )  # Use VDB result confidence
+                    # Add canonical from VDB so batch entities can match it.
+                    # VDB matches don't have type info available, use empty.
+                    canonical_entities.append((result.matched_entity, ""))
+                    logger.debug(
+                        f"Pre-resolution (VDB {result.method}): "
+                        f"'{entity_name}' → '{result.matched_entity}'"
+                    )
+                    continue
+            except Exception as e:
+                logger.debug(
+                    f"Pre-resolution VDB check failed for '{entity_name}': {e}"
+                )
+
+        # No match found - this is a new canonical entity
+        canonical_entities.append((entity_name, entity_type))
+
+    if resolution_map:
+        logger.info(
+            f"Pre-resolution: {len(resolution_map)} entities mapped to canonical forms"
+        )
+
+    return resolution_map, confidence_map
+
+
 async def _merge_nodes_then_upsert(
     entity_name: str,
     nodes_data: list[dict],
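The numeric-suffix guard is easy to check in isolation, since it depends only on `re`:

```python
# Standalone check of the numeric-suffix guard added above.
import re


def _has_different_numeric_suffix(name_a: str, name_b: str) -> bool:
    pattern = r"(\d+(?:\.\d+)?)"
    nums_a = re.findall(pattern, name_a)
    nums_b = re.findall(pattern, name_b)
    return bool(nums_a and nums_b and nums_a != nums_b)


print(_has_different_numeric_suffix("Interleukin-4", "Interleukin-13"))  # True  -> fuzzy match blocked
print(_has_different_numeric_suffix("IL-4", "IL-4 receptor"))            # False -> same number, allowed
print(_has_different_numeric_suffix("Phase 2", "Phase 3"))               # True  -> distinct trial phases
```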
@@ -1600,8 +1782,128 @@ async def _merge_nodes_then_upsert(
     pipeline_status_lock=None,
     llm_response_cache: BaseKVStorage | None = None,
     entity_chunks_storage: BaseKVStorage | None = None,
-):
-    """Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert."""
+    pre_resolution_map: dict[str, str] | None = None,
+) -> tuple[dict, str | None]:
+    """Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.
+
+    Returns:
+        Tuple of (node_data, original_entity_name). original_entity_name is set if
+        entity resolution changed the name (e.g., "Dupixant" → "Dupixent"),
+        otherwise None.
+    """
+    original_entity_name = entity_name  # Track original before resolution
+
+    # Apply pre-resolution map immediately (prevents race conditions in parallel processing)
+    pre_resolved = False
+    if pre_resolution_map and entity_name in pre_resolution_map:
+        entity_name = pre_resolution_map[entity_name]
+        pre_resolved = True
+        logger.debug(
+            f"Applied pre-resolution: '{original_entity_name}' → '{entity_name}'"
+        )
+
+    # Entity Resolution: Resolve new entity against existing entities
+    # Skip if already pre-resolved (to avoid redundant VDB queries)
+    entity_resolution_config_raw = global_config.get("entity_resolution_config")
+    entity_resolution_config = None
+    if entity_resolution_config_raw:
+        # Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
+        if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
+            entity_resolution_config = entity_resolution_config_raw
+        elif isinstance(entity_resolution_config_raw, dict):
+            try:
+                entity_resolution_config = EntityResolutionConfig(
+                    **entity_resolution_config_raw
+                )
+            except TypeError as e:
+                logger.warning(
+                    f"Invalid entity_resolution_config: {e}. "
+                    f"Config: {entity_resolution_config_raw}. Skipping resolution."
+                )
+
+    # Safely check if entity resolution is enabled, handling both object and dict forms
+    def _is_resolution_enabled(config) -> bool:
+        if config is None:
+            return False
+        if isinstance(config, dict):
+            return config.get("enabled", False)
+        return getattr(config, "enabled", False)
+
+    # Skip VDB resolution if entity was already pre-resolved (prevents redundant queries)
+    if (
+        _is_resolution_enabled(entity_resolution_config)
+        and entity_vdb
+        and not pre_resolved
+    ):
+        original_name = entity_name
+        workspace = global_config.get("workspace", "")
+        # Try knowledge_graph_inst.db first (more reliable), fallback to entity_vdb.db
+        db = getattr(knowledge_graph_inst, "db", None) or getattr(
+            entity_vdb, "db", None
+        )
+
+        # Layer 0: Check alias cache first (PostgreSQL-only - requires db connection)
+        # Note: Alias caching is only available when using PostgreSQL storage backend
+        if db is not None:
+            try:
+                cached = await get_cached_alias(original_name, db, workspace)
+                if cached:
+                    canonical, method, _ = cached
+                    logger.debug(
+                        f"Alias cache hit: '{original_name}' → '{canonical}' "
+                        f"(method: {method})"
+                    )
+                    entity_name = canonical
+            except Exception as e:
+                logger.warning(
+                    f"Entity resolution cache lookup failed for '{original_name}' "
+                    f"(workspace: {workspace}): {type(e).__name__}: {e}. "
+                    "Continuing without cache."
+                )
+
+        # Layers 1-3: Full VDB resolution (if not found in cache)
+        if entity_name == original_name:
+            llm_fn = global_config.get("llm_model_func")
+            if llm_fn:
+                try:
+                    resolution = await resolve_entity_with_vdb(
+                        entity_name,
+                        entity_vdb,
+                        llm_fn,
+                        entity_resolution_config,
+                    )
+                    if resolution.action == "match" and resolution.matched_entity:
+                        logger.info(
+                            f"Entity resolution: '{entity_name}' → '{resolution.matched_entity}' "
+                            f"(method: {resolution.method}, confidence: {resolution.confidence:.2f})"
+                        )
+                        entity_name = resolution.matched_entity
+
+                        # Store in alias cache for next time (PostgreSQL-only)
+                        # Note: Alias caching requires PostgreSQL storage backend
+                        if db is not None:
+                            try:
+                                await store_alias(
+                                    original_name,
+                                    entity_name,
+                                    resolution.method,
+                                    resolution.confidence,
+                                    db,
+                                    workspace,
+                                )
+                            except Exception as e:
+                                logger.warning(
+                                    f"Failed to store entity alias '{original_name}' → '{entity_name}' "
+                                    f"(workspace: {workspace}): {type(e).__name__}: {e}. "
+                                    "Resolution succeeded but cache not updated."
+                                )
+                except Exception as e:
+                    logger.warning(
+                        f"Entity resolution failed for '{original_name}' "
+                        f"(workspace: {workspace}): {type(e).__name__}: {e}. "
+                        "Continuing with original entity name."
+                    )
+
     already_entity_types = []
     already_source_ids = []
     already_description = []
@@ -1865,7 +2167,12 @@ async def _merge_nodes_then_upsert(
         max_retries=3,
         retry_delay=0.1,
     )
-    return node_data
+
+    # Return original name if resolution changed it, None otherwise
+    resolved_from = (
+        original_entity_name if entity_name != original_entity_name else None
+    )
+    return node_data, resolved_from
 
 
 async def _merge_edges_then_upsert(
@@ -1882,7 +2189,13 @@ async def _merge_edges_then_upsert(
     added_entities: list = None,  # New parameter to track entities added during edge processing
     relation_chunks_storage: BaseKVStorage | None = None,
     entity_chunks_storage: BaseKVStorage | None = None,
+    entity_resolution_map: dict[str, str] | None = None,  # Map original→resolved names
 ):
+    # Apply entity resolution mapping to edge endpoints
+    if entity_resolution_map:
+        src_id = entity_resolution_map.get(src_id, src_id)
+        tgt_id = entity_resolution_map.get(tgt_id, tgt_id)
+
     if src_id == tgt_id:
         return None
 
@@ -2472,6 +2785,76 @@ async def merge_nodes_and_edges(
     graph_max_async = global_config.get("llm_model_max_async", 4) * 2
     semaphore = asyncio.Semaphore(graph_max_async)
 
+    # ===== Pre-Resolution Phase: Build entity resolution map =====
+    # This prevents race conditions when parallel workers process similar entities
+    # IMPORTANT: Include BOTH entity names AND relation endpoints to catch all duplicates
+    pre_resolution_map: dict[str, str] = {}
+    entity_resolution_config_raw = global_config.get("entity_resolution_config")
+    if entity_resolution_config_raw:
+        # Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
+        config = None
+        if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
+            config = entity_resolution_config_raw
+        elif isinstance(entity_resolution_config_raw, dict):
+            try:
+                config = EntityResolutionConfig(**entity_resolution_config_raw)
+            except TypeError as e:
+                logger.warning(
+                    f"Invalid entity_resolution_config: {e}. "
+                    f"Config: {entity_resolution_config_raw}. Skipping resolution."
+                )
+        if config and config.enabled:
+            llm_fn = global_config.get("llm_model_func")
+            # Build entity_types map for type-aware fuzzy matching.
+            # Use first non-empty type for entities with multiple occurrences.
+            entity_types: dict[str, str] = {}
+            for entity_name, entities in all_nodes.items():
+                for entity_data in entities:
+                    etype = entity_data.get("entity_type", "")
+                    if etype:
+                        entity_types[entity_name] = etype
+                        break
+
+            # Collect ALL entity names: from entities AND from relation endpoints
+            # This ensures relation endpoints like "EU Medicines Agency" get resolved
+            # against existing entities like "European Medicines Agency"
+            all_entity_names = set(all_nodes.keys())
+            for src_id, tgt_id in all_edges.keys():
+                all_entity_names.add(src_id)
+                all_entity_names.add(tgt_id)
+
+            pre_resolution_map, confidence_map = await _build_pre_resolution_map(
+                list(all_entity_names),
+                entity_types,
+                entity_vdb,
+                llm_fn,
+                config,
+            )
+
+            # Cache pre-resolution aliases for future lookups (PostgreSQL-only)
+            # This ensures aliases discovered during batch processing are available
+            # for subsequent document ingestion without re-running resolution
+            db = getattr(knowledge_graph_inst, "db", None)
+            if db is not None and pre_resolution_map:
+                workspace = global_config.get("workspace", "")
+                for alias, canonical in pre_resolution_map.items():
+                    # Don't cache self-references (entity → itself)
+                    if alias.lower().strip() != canonical.lower().strip():
+                        try:
+                            await store_alias(
+                                alias=alias,
+                                canonical=canonical,
+                                method="pre_resolution",
+                                confidence=confidence_map.get(alias, 1.0),
+                                db=db,
+                                workspace=workspace,
+                            )
+                        except Exception as e:
+                            logger.debug(
+                                f"Failed to cache pre-resolution alias "
+                                f"'{alias}' → '{canonical}': {e}"
+                            )
+
     # ===== Phase 1: Process all entities concurrently =====
     log_message = f"Phase 1: Processing {total_entities_count} entities from {doc_id} (async: {graph_max_async})"
     logger.info(log_message)
@@ -2479,6 +2862,11 @@ async def merge_nodes_and_edges(
         pipeline_status["latest_message"] = log_message
         pipeline_status["history_messages"].append(log_message)
 
+    # Resolution map to track original→resolved entity names (e.g., "Dupixant"→"Dupixent")
+    # This will be used to remap edge endpoints in Phase 2
+    entity_resolution_map: dict[str, str] = {}
+    resolution_map_lock = asyncio.Lock()
+
     async def _locked_process_entity_name(entity_name, entities):
         async with semaphore:
             # Check for cancellation before processing entity
@@ -2496,7 +2884,7 @@ async def merge_nodes_and_edges(
             ):
                 try:
                     logger.debug(f"Processing entity {entity_name}")
-                    entity_data = await _merge_nodes_then_upsert(
+                    entity_data, resolved_from = await _merge_nodes_then_upsert(
                         entity_name,
                         entities,
                         knowledge_graph_inst,
@@ -2506,8 +2894,15 @@ async def merge_nodes_and_edges(
                         pipeline_status_lock,
                         llm_response_cache,
                         entity_chunks_storage,
+                        pre_resolution_map,
                     )
 
+                    # Track resolution mapping for edge remapping in Phase 2
+                    if resolved_from is not None:
+                        resolved_to = entity_data.get("entity_name", entity_name)
+                        async with resolution_map_lock:
+                            entity_resolution_map[resolved_from] = resolved_to
+
                     return entity_data
 
                 except Exception as e:
@@ -2617,6 +3012,7 @@ async def merge_nodes_and_edges(
                         added_entities,  # Pass list to collect added entities
                         relation_chunks_storage,
                         entity_chunks_storage,  # Add entity_chunks_storage parameter
+                        entity_resolution_map,  # Apply entity resolution to edge endpoints
                     )
 
                     if edge_data is None:

@@ -2649,9 +3045,36 @@ async def merge_nodes_and_edges(
                raise prefixed_exception from e

    # Create relationship processing tasks
    edge_tasks = []
    # Apply pre_resolution_map to edge endpoints to prevent duplicates from relation extraction
    # Key fixes: sort for lock ordering, filter self-loops, deduplicate merged edges
    resolved_edges: dict[tuple[str, str], list] = {}
    for edge_key, edges in all_edges.items():
        task = asyncio.create_task(_locked_process_edges(edge_key, edges))
        # Remap edge endpoints using pre-resolution map
        # This catches cases like "EU Medicines Agency" → "European Medicines Agency"
        resolved_src = pre_resolution_map.get(edge_key[0], edge_key[0])
        resolved_tgt = pre_resolution_map.get(edge_key[1], edge_key[1])

        # Skip self-loops created by resolution (e.g., both endpoints resolve to same entity)
        if resolved_src == resolved_tgt:
            logger.debug(
                f"Skipping self-loop after resolution: {edge_key} → ({resolved_src}, {resolved_tgt})"
            )
            continue

        # Sort for consistent lock ordering (prevents deadlocks)
        resolved_edge_key = tuple(sorted([resolved_src, resolved_tgt]))

        # Merge edges that resolve to same key (deduplication)
        if resolved_edge_key not in resolved_edges:
            resolved_edges[resolved_edge_key] = []
        resolved_edges[resolved_edge_key].extend(edges)

    # Create tasks from deduplicated edges
    edge_tasks = []
    for resolved_edge_key, merged_edges in resolved_edges.items():
        task = asyncio.create_task(
            _locked_process_edges(resolved_edge_key, merged_edges)
        )
        edge_tasks.append(task)

    # Execute relationship tasks with error handling
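
Condensed, the loop above does three things: remap endpoints, drop resolution-induced self-loops, and merge edges that now share a key. A runnable sketch under assumed data shapes (edge payloads are stand-in strings here):

def dedupe_edges(all_edges, pre_resolution_map):
    # Collapse edges whose endpoints resolve to the same canonical pair.
    resolved = {}
    for (src, tgt), edges in all_edges.items():
        src = pre_resolution_map.get(src, src)
        tgt = pre_resolution_map.get(tgt, tgt)
        if src == tgt:                        # self-loop created by resolution
            continue
        key = tuple(sorted((src, tgt)))       # stable lock ordering
        resolved.setdefault(key, []).extend(edges)
    return resolved

edges = {("EU Medicines Agency", "Dupixent"): ["e1"],
         ("European Medicines Agency", "Dupixent"): ["e2"]}
amap = {"EU Medicines Agency": "European Medicines Agency"}
print(dedupe_edges(edges, amap))
# {('Dupixent', 'European Medicines Agency'): ['e1', 'e2']}

Sorting the resolved pair gives every worker the same lock acquisition order, which is what prevents the deadlock mentioned in the comment.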

@@ -2,7 +2,6 @@ import { useState, useCallback, useEffect, useRef } from 'react'
import ThemeProvider from '@/components/ThemeProvider'
import TabVisibilityProvider from '@/contexts/TabVisibilityProvider'
import ApiKeyAlert from '@/components/ApiKeyAlert'
import StatusIndicator from '@/components/status/StatusIndicator'
import { SiteInfo, webuiPrefix } from '@/lib/constants'
import { useBackendState, useAuthStore } from '@/stores/state'
import { useSettingsStore } from '@/stores/settings'

@@ -218,7 +217,6 @@ function App() {
          </TabsContent>
        </div>
      </Tabs>
      {enableHealthCheck && <StatusIndicator />}
      <ApiKeyAlert open={apiKeyAlertOpen} onOpenChange={handleApiKeyAlertOpenChange} />
    </main>
  )}

@@ -1,9 +1,7 @@
import { useState, useCallback } from 'react'
import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/Popover'
import { useCallback } from 'react'
import Button from '@/components/ui/Button'
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/Select'
import { useSettingsStore } from '@/stores/settings'
import { PaletteIcon } from 'lucide-react'
import { SunIcon, MoonIcon } from 'lucide-react'
import { useTranslation } from 'react-i18next'
import { cn } from '@/lib/utils'

@@ -12,63 +10,39 @@ interface AppSettingsProps {
}

export default function AppSettings({ className }: AppSettingsProps) {
  const [opened, setOpened] = useState<boolean>(false)
  const { t } = useTranslation()

  const language = useSettingsStore.use.language()
  const setLanguage = useSettingsStore.use.setLanguage()

  const theme = useSettingsStore.use.theme()
  const setTheme = useSettingsStore.use.setTheme()

  const handleLanguageChange = useCallback((value: string) => {
    setLanguage(value as 'en' | 'zh' | 'fr' | 'ar' | 'zh_TW')
  }, [setLanguage])
  // Compute effective theme for icon/tooltip display when theme is 'system'
  const effectiveTheme = theme === 'system'
    ? (window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light')
    : theme

  const handleThemeChange = useCallback((value: string) => {
    setTheme(value as 'light' | 'dark' | 'system')
  }, [setTheme])
  const handleThemeToggle = useCallback(() => {
    if (theme === 'system') {
      // Detect actual system preference and toggle to opposite
      const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches
      setTheme(isDark ? 'light' : 'dark')
    } else {
      setTheme(theme === 'dark' ? 'light' : 'dark')
    }
  }, [theme, setTheme])

  return (
    <Popover open={opened} onOpenChange={setOpened}>
      <PopoverTrigger asChild>
        <Button variant="ghost" size="icon" className={cn('h-9 w-9', className)}>
          <PaletteIcon className="h-5 w-5" />
        </Button>
      </PopoverTrigger>
      <PopoverContent side="bottom" align="end" className="w-56">
        <div className="flex flex-col gap-4">
          <div className="flex flex-col gap-2">
            <label className="text-sm font-medium">{t('settings.language')}</label>
            <Select value={language} onValueChange={handleLanguageChange}>
              <SelectTrigger>
                <SelectValue />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="en">English</SelectItem>
                <SelectItem value="zh">中文</SelectItem>
                <SelectItem value="fr">Français</SelectItem>
                <SelectItem value="ar">العربية</SelectItem>
                <SelectItem value="zh_TW">繁體中文</SelectItem>
              </SelectContent>
            </Select>
          </div>

          <div className="flex flex-col gap-2">
            <label className="text-sm font-medium">{t('settings.theme')}</label>
            <Select value={theme} onValueChange={handleThemeChange}>
              <SelectTrigger>
                <SelectValue />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="light">{t('settings.light')}</SelectItem>
                <SelectItem value="dark">{t('settings.dark')}</SelectItem>
                <SelectItem value="system">{t('settings.system')}</SelectItem>
              </SelectContent>
            </Select>
          </div>
        </div>
      </PopoverContent>
    </Popover>
    <Button
      variant="ghost"
      size="icon"
      className={cn('h-9 w-9', className)}
      onClick={handleThemeToggle}
      tooltip={effectiveTheme === 'dark' ? t('settings.light') : t('settings.dark')}
    >
      {effectiveTheme === 'dark' ? (
        <MoonIcon className="h-5 w-5" />
      ) : (
        <SunIcon className="h-5 w-5" />
      )}
    </Button>
  )
}

@@ -4,7 +4,7 @@ import { useEffect, useState } from 'react'
import StatusDialog from './StatusDialog'
import { useTranslation } from 'react-i18next'

const StatusIndicator = () => {
const StatusIndicator = ({ className }: { className?: string }) => {
  const { t } = useTranslation()
  const health = useBackendState.use.health()
  const lastCheckTime = useBackendState.use.lastCheckTime()

@@ -20,7 +20,7 @@ const StatusIndicator = () => {
  }, [lastCheckTime])

  return (
    <div className="fixed right-4 bottom-4 flex items-center gap-2 opacity-80 select-none">
    <div className={cn("flex items-center gap-2 opacity-80 select-none", className)}>
      <div
        className="flex cursor-pointer items-center gap-2"
        onClick={() => setDialogOpen(true)}

@@ -1,13 +1,14 @@
import Button from '@/components/ui/Button'
import { SiteInfo, webuiPrefix } from '@/lib/constants'
import { webuiPrefix } from '@/lib/constants'
import AppSettings from '@/components/AppSettings'
import StatusIndicator from '@/components/status/StatusIndicator'
import { TabsList, TabsTrigger } from '@/components/ui/Tabs'
import { useSettingsStore } from '@/stores/settings'
import { useAuthStore } from '@/stores/state'
import { cn } from '@/lib/utils'
import { useTranslation } from 'react-i18next'
import { navigationService } from '@/services/navigation'
import { ZapIcon, GithubIcon, LogOutIcon } from 'lucide-react'
import { ZapIcon, LogOutIcon } from 'lucide-react'
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/Tooltip'

interface NavigationTabProps {

@@ -56,17 +57,8 @@ function TabsNavigation() {

export default function SiteHeader() {
  const { t } = useTranslation()
  const { isGuestMode, coreVersion, apiVersion, username, webuiTitle, webuiDescription } = useAuthStore()

  const versionDisplay = (coreVersion && apiVersion)
    ? `${coreVersion}/${apiVersion}`
    : null;

  // Check if frontend needs rebuild (apiVersion ends with warning symbol)
  const hasWarning = apiVersion?.endsWith('⚠️');
  const versionTooltip = hasWarning
    ? t('header.frontendNeedsRebuild')
    : versionDisplay ? `v${versionDisplay}` : '';
  const { isGuestMode, username, webuiTitle, webuiDescription } = useAuthStore()
  const enableHealthCheck = useSettingsStore.use.enableHealthCheck()

  const handleLogout = () => {
    navigationService.navigateToLogin();

@@ -77,7 +69,6 @@ export default function SiteHeader() {
      <div className="min-w-[200px] w-auto flex items-center">
        <a href={webuiPrefix} className="flex items-center gap-2">
          <ZapIcon className="size-4 text-emerald-400" aria-hidden="true" />
          <span className="font-bold md:inline-block">{SiteInfo.name}</span>
        </a>
        {webuiTitle && (
          <div className="flex items-center">

@@ -111,25 +102,7 @@ export default function SiteHeader() {

      <nav className="w-[200px] flex items-center justify-end">
        <div className="flex items-center gap-2">
          {versionDisplay && (
            <TooltipProvider>
              <Tooltip>
                <TooltipTrigger asChild>
                  <span className="text-xs text-gray-500 dark:text-gray-400 mr-1 cursor-default">
                    v{versionDisplay}
                  </span>
                </TooltipTrigger>
                <TooltipContent side="bottom">
                  {versionTooltip}
                </TooltipContent>
              </Tooltip>
            </TooltipProvider>
          )}
          <Button variant="ghost" size="icon" side="bottom" tooltip={t('header.projectRepository')}>
            <a href={SiteInfo.github} target="_blank" rel="noopener noreferrer">
              <GithubIcon className="size-4" aria-hidden="true" />
            </a>
          </Button>
          {enableHealthCheck && <StatusIndicator />}
          <AppSettings />
          {!isGuestMode && (
            <Button

@@ -87,7 +87,7 @@ interface SettingsState {
const useSettingsStoreBase = create<SettingsState>()(
  persist(
    (set) => ({
      theme: 'system',
      theme: 'light',
      language: 'en',
      showPropertyPanel: true,
      showNodeSearchBar: true,

@@ -238,7 +238,7 @@ const useSettingsStoreBase = create<SettingsState>()(
    {
      name: 'settings-storage',
      storage: createJSONStorage(() => localStorage),
      version: 19,
      version: 20,
      migrate: (state: any, version: number) => {
        if (version < 2) {
          state.showEdgeLabel = false

@@ -341,6 +341,15 @@ const useSettingsStoreBase = create<SettingsState>()(
            delete state.querySettings.response_type
          }
        }
        if (version < 20) {
          // Only set defaults if values are missing (preserve user preference)
          if (!state.theme) {
            state.theme = 'light'
          }
          if (!state.language) {
            state.language = 'en'
          }
        }
        return state
      }
    }

1
tests/test_entity_resolution/__init__.py
Normal file

@@ -0,0 +1 @@
# Entity resolution tests

240
tests/test_entity_resolution/test_resolver.py
Normal file

@@ -0,0 +1,240 @@
"""
|
||||
Unit tests for Entity Resolution
|
||||
|
||||
Tests the 3-layer approach with mock embed_fn and llm_fn.
|
||||
No database or external services required.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from lightrag.entity_resolution import (
|
||||
EntityResolutionConfig,
|
||||
resolve_entity,
|
||||
)
|
||||
|
||||
# Mock embeddings - pre-computed for test entities
|
||||
# These simulate what an embedding model would return
|
||||
MOCK_EMBEDDINGS = {
|
||||
# FDA and full name have ~0.67 similarity (based on real test)
|
||||
"fda": [0.1, 0.2, 0.3, 0.4, 0.5],
|
||||
"us food and drug administration": [0.15, 0.25, 0.28, 0.38, 0.52],
|
||||
# Dupixent and dupilumab have ~0.63 similarity
|
||||
"dupixent": [0.5, 0.6, 0.7, 0.8, 0.9],
|
||||
"dupilumab": [0.48, 0.58, 0.72, 0.78, 0.88],
|
||||
# Celebrex and Cerebyx are different (low similarity)
|
||||
"celebrex": [0.9, 0.1, 0.2, 0.3, 0.4],
|
||||
"cerebyx": [0.1, 0.9, 0.8, 0.7, 0.6],
|
||||
# Default for unknown entities
|
||||
"default": [0.0, 0.0, 0.0, 0.0, 0.0],
|
||||
}
|
||||
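
For reference, the gate these vectors must pass: assuming the resolver scores candidates by cosine similarity against ENTITY_RESOLUTION_VECTOR_THRESHOLD (0.5), the comparison looks roughly like the sketch below, reusing the MOCK_EMBEDDINGS above (this is not the module's actual code):

import math

def cosine_similarity(a: list[float], b: list[float]) -> float:
    # Standard cosine similarity; returns 0.0 for zero vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0

# ~0.99 for these toy vectors, comfortably above the 0.5 gate that
# forwards a candidate to LLM verification.
print(cosine_similarity(MOCK_EMBEDDINGS["fda"],
                        MOCK_EMBEDDINGS["us food and drug administration"]))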

# Mock LLM responses
MOCK_LLM_RESPONSES = {
    ("fda", "us food and drug administration"): "YES",
    ("us food and drug administration", "fda"): "YES",
    ("dupixent", "dupilumab"): "YES",
    ("dupilumab", "dupixent"): "YES",
    ("heart attack", "myocardial infarction"): "YES",
    ("celebrex", "cerebyx"): "NO",
    ("metformin", "metoprolol"): "NO",
}

async def mock_embed_fn(text: str) -> list[float]:
    """Mock embedding function."""
    key = text.lower().strip()
    return MOCK_EMBEDDINGS.get(key, MOCK_EMBEDDINGS["default"])


async def mock_llm_fn(prompt: str) -> str:
    """Mock LLM function that parses the prompt and returns YES/NO."""
    # Extract term_a and term_b from the prompt
    lines = prompt.strip().split("\n")
    term_a = None
    term_b = None
    for line in lines:
        if line.startswith("Term A:"):
            term_a = line.replace("Term A:", "").strip().lower()
        elif line.startswith("Term B:"):
            term_b = line.replace("Term B:", "").strip().lower()

    if term_a and term_b:
        # Check both orderings
        response = MOCK_LLM_RESPONSES.get((term_a, term_b))
        if response is None:
            response = MOCK_LLM_RESPONSES.get((term_b, term_a), "NO")
        return response
    return "NO"
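
The parser above assumes the resolver's verification prompt contains literal `Term A:` and `Term B:` lines; that format is inferred from the mock, not guaranteed by the module. A quick check that the mocks behave as the tests expect:

import asyncio

sample_prompt = (
    "Are these the same entity?\n"
    "Term A: FDA\n"
    "Term B: US Food and Drug Administration\n"
    "Answer YES or NO."
)
print(asyncio.run(mock_llm_fn(sample_prompt)))  # YES
print(asyncio.run(mock_embed_fn("Dupixent")))   # [0.5, 0.6, 0.7, 0.8, 0.9]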

# Test fixtures
@pytest.fixture
def existing_entities():
    """Existing entities in the knowledge graph."""
    return [
        (
            "US Food and Drug Administration",
            MOCK_EMBEDDINGS["us food and drug administration"],
        ),
        ("Dupixent", MOCK_EMBEDDINGS["dupixent"]),
        ("Celebrex", MOCK_EMBEDDINGS["celebrex"]),
    ]


@pytest.fixture
def config():
    """Default resolution config."""
    return EntityResolutionConfig()

# Layer 1: Case normalization tests
class TestCaseNormalization:
    @pytest.mark.asyncio
    async def test_exact_match_same_case(self, existing_entities, config):
        """Exact match with same case."""
        result = await resolve_entity(
            "Dupixent",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "Dupixent"
        assert result.method == "exact"
        assert result.confidence == 1.0

    @pytest.mark.asyncio
    async def test_exact_match_different_case(self, existing_entities, config):
        """DUPIXENT should match Dupixent via case normalization."""
        result = await resolve_entity(
            "DUPIXENT",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "Dupixent"
        assert result.method == "exact"

    @pytest.mark.asyncio
    async def test_exact_match_lowercase(self, existing_entities, config):
        """dupixent should match Dupixent."""
        result = await resolve_entity(
            "dupixent",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.method == "exact"

# Layer 2: Fuzzy matching tests
class TestFuzzyMatching:
    @pytest.mark.asyncio
    async def test_fuzzy_match_typo(self, existing_entities, config):
        """Dupixant (typo) should match Dupixent via fuzzy matching (88%)."""
        result = await resolve_entity(
            "Dupixant",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "Dupixent"
        assert result.method == "fuzzy"
        assert result.confidence >= 0.85

    @pytest.mark.asyncio
    async def test_fuzzy_rejects_below_threshold(self, existing_entities, config):
        """Celebrex vs Cerebyx is 67% - should NOT fuzzy match."""
        # Add Cerebyx as the query (Celebrex exists)
        result = await resolve_entity(
            "Cerebyx",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        # Should not be fuzzy match (67% < 85%)
        assert result.method != "fuzzy" or result.action == "new"

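
The 88% and 67% figures in these docstrings are consistent with a plain difflib.SequenceMatcher ratio, shown below for reference; the resolver itself may use a different fuzzy backend, so treat this as an approximation:

from difflib import SequenceMatcher

def fuzzy_ratio(a: str, b: str) -> float:
    # Case-insensitive similarity in [0, 1].
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

print(round(fuzzy_ratio("Dupixant", "Dupixent"), 2))  # 0.88 -> above the 0.85 threshold
print(round(fuzzy_ratio("Celebrex", "Cerebyx"), 2))   # 0.67 -> rejected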
# Layer 3: LLM verification tests
class TestLLMVerification:
    @pytest.mark.asyncio
    async def test_llm_matches_acronym(self, existing_entities, config):
        """FDA should match US Food and Drug Administration via LLM."""
        result = await resolve_entity(
            "FDA",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "US Food and Drug Administration"
        assert result.method == "llm"

    @pytest.mark.asyncio
    async def test_llm_matches_brand_generic(self, config):
        """Dupixent should match dupilumab via LLM."""
        existing = [
            ("dupilumab", MOCK_EMBEDDINGS["dupilumab"]),
        ]
        result = await resolve_entity(
            "Dupixent",
            existing,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "match"
        assert result.matched_entity == "dupilumab"
        assert result.method == "llm"

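
These tests exercise the layer-3 path: an embedding pre-filter selects candidates, and only those are sent to the LLM. A hedged sketch of that control flow under the same assumptions as the mocks, reusing cosine_similarity from the earlier sketch (the prompt formatting here is hypothetical):

async def llm_layer(name, candidates, embed_fn, llm_fn, vector_threshold=0.5):
    # candidates: list of (existing_name, embedding) pairs, as in the fixtures.
    query_vec = await embed_fn(name)
    for candidate, cand_vec in candidates:
        if cosine_similarity(query_vec, cand_vec) < vector_threshold:
            continue  # too dissimilar; skip the LLM call
        prompt = f"Term A: {name}\nTerm B: {candidate}\nAnswer YES or NO."
        if (await llm_fn(prompt)).strip().upper().startswith("YES"):
            return candidate
    return None

With the mocks above, awaiting llm_layer("FDA", existing_entities, mock_embed_fn, mock_llm_fn) would return "US Food and Drug Administration".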
# Edge cases
class TestEdgeCases:
    @pytest.mark.asyncio
    async def test_empty_existing_entities(self, config):
        """New entity when no existing entities."""
        result = await resolve_entity(
            "NewEntity",
            [],
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "new"

    @pytest.mark.asyncio
    async def test_disabled_resolution(self, existing_entities):
        """Resolution disabled returns new."""
        config = EntityResolutionConfig(enabled=False)
        result = await resolve_entity(
            "Dupixent",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "new"
        assert result.method == "disabled"

    @pytest.mark.asyncio
    async def test_genuinely_new_entity(self, existing_entities, config):
        """Completely new entity should return 'new'."""
        result = await resolve_entity(
            "CompletelyNewDrug",
            existing_entities,
            mock_embed_fn,
            mock_llm_fn,
            config,
        )
        assert result.action == "new"
        assert result.method == "none"
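
Every test is marked @pytest.mark.asyncio, so the suite presumably depends on pytest-asyncio being installed; under that assumption it can also be launched from Python:

import pytest

# Equivalent to running: pytest tests/test_entity_resolution -v
pytest.main(["-v", "tests/test_entity_resolution"])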