feat: add automatic entity resolution with 3-layer matching
Implement automatic entity resolution to prevent duplicate nodes in the knowledge graph. The system uses a 3-layer approach: 1. Case-insensitive exact matching (free, instant) 2. Fuzzy string matching >85% threshold (free, instant) 3. Vector similarity + LLM verification (for acronyms/synonyms) Key features: - Pre-resolution phase prevents race conditions in parallel processing - Numeric suffix detection blocks false matches (IL-4 ≠ IL-13) - PostgreSQL alias cache for fast lookups on subsequent ingestion - Configurable thresholds via environment variables Bug fixes included: - Fix fuzzy matching false positives for numbered entities - Fix alias cache not being populated (missing db parameter) - Skip entity_aliases table from generic id index creation New files: - lightrag/entity_resolution/ - Core resolution module - tests/test_entity_resolution/ - Unit tests - docker/postgres-age-vector/ - Custom PG image with pgvector + AGE - docker-compose.test.yml - Integration test environment Configuration (env.example): - ENTITY_RESOLUTION_ENABLED=true - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85 - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5 - ENTITY_RESOLUTION_MAX_CANDIDATES=3
This commit is contained in:
parent
4f12fe121d
commit
48c7732edc
20 changed files with 1561 additions and 101 deletions
84
docker-compose.test.yml
Normal file
84
docker-compose.test.yml
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
name: lightrag-entity-resolution-test
|
||||||
|
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
container_name: lightrag-postgres
|
||||||
|
build:
|
||||||
|
context: ./docker/postgres-age-vector
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
environment:
|
||||||
|
POSTGRES_DB: lightrag
|
||||||
|
POSTGRES_USER: lightrag
|
||||||
|
POSTGRES_PASSWORD: lightrag_pass
|
||||||
|
ports:
|
||||||
|
- "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres
|
||||||
|
volumes:
|
||||||
|
- pgdata_test:/var/lib/postgresql/data
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
|
||||||
|
interval: 5s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
lightrag:
|
||||||
|
container_name: lightrag-test
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
ports:
|
||||||
|
- "9622:9621" # Use 9622 to avoid conflict
|
||||||
|
volumes:
|
||||||
|
- ./data/rag_storage_test:/app/data/rag_storage
|
||||||
|
- ./data/inputs_test:/app/data/inputs
|
||||||
|
environment:
|
||||||
|
# Server
|
||||||
|
- HOST=0.0.0.0
|
||||||
|
- PORT=9621
|
||||||
|
- LOG_LEVEL=DEBUG
|
||||||
|
|
||||||
|
# LLM (OpenAI)
|
||||||
|
- LLM_BINDING=openai
|
||||||
|
- LLM_MODEL=gpt-4o-mini
|
||||||
|
- LLM_BINDING_HOST=https://api.openai.com/v1
|
||||||
|
- LLM_BINDING_API_KEY=${OPENAI_API_KEY}
|
||||||
|
|
||||||
|
# Embedding
|
||||||
|
- EMBEDDING_BINDING=openai
|
||||||
|
- EMBEDDING_MODEL=text-embedding-3-small
|
||||||
|
- EMBEDDING_DIM=1536
|
||||||
|
- EMBEDDING_BINDING_HOST=https://api.openai.com/v1
|
||||||
|
- EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}
|
||||||
|
|
||||||
|
# Storage Configuration - Full PostgreSQL!
|
||||||
|
# Custom postgres image has pgvector + Apache AGE
|
||||||
|
- LIGHTRAG_KV_STORAGE=PGKVStorage
|
||||||
|
- LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
|
||||||
|
- LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
|
||||||
|
- LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
|
||||||
|
- POSTGRES_HOST=postgres
|
||||||
|
- POSTGRES_PORT=5432
|
||||||
|
- POSTGRES_USER=lightrag
|
||||||
|
- POSTGRES_PASSWORD=lightrag_pass
|
||||||
|
- POSTGRES_DATABASE=lightrag
|
||||||
|
|
||||||
|
# Entity Resolution - ENABLED!
|
||||||
|
- ENTITY_RESOLUTION_ENABLED=true
|
||||||
|
- ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
|
||||||
|
- ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
|
||||||
|
- ENTITY_RESOLUTION_MAX_CANDIDATES=3
|
||||||
|
|
||||||
|
# Processing
|
||||||
|
- MAX_ASYNC=4
|
||||||
|
- CHUNK_SIZE=1200
|
||||||
|
depends_on:
|
||||||
|
postgres:
|
||||||
|
condition: service_healthy
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
start_period: 30s
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
pgdata_test:
|
||||||
26
docker/postgres-age-vector/Dockerfile
Normal file
26
docker/postgres-age-vector/Dockerfile
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
# Start from pgvector image (has vector extension pre-built correctly)
|
||||||
|
FROM pgvector/pgvector:pg17
|
||||||
|
|
||||||
|
# Install build dependencies for AGE
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
build-essential \
|
||||||
|
git \
|
||||||
|
postgresql-server-dev-17 \
|
||||||
|
libreadline-dev \
|
||||||
|
zlib1g-dev \
|
||||||
|
flex \
|
||||||
|
bison \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install Apache AGE 1.6.0 for PG17
|
||||||
|
RUN cd /tmp \
|
||||||
|
&& git clone --branch release/PG17/1.6.0 https://github.com/apache/age.git \
|
||||||
|
&& cd age \
|
||||||
|
&& make \
|
||||||
|
&& make install \
|
||||||
|
&& rm -rf /tmp/age
|
||||||
|
|
||||||
|
# Add initialization script to create extensions
|
||||||
|
RUN echo "CREATE EXTENSION IF NOT EXISTS vector;" > /docker-entrypoint-initdb.d/01-vector.sql \
|
||||||
|
&& echo "CREATE EXTENSION IF NOT EXISTS age;" > /docker-entrypoint-initdb.d/02-age.sql \
|
||||||
|
&& echo "SET search_path = ag_catalog, public;" >> /docker-entrypoint-initdb.d/02-age.sql
|
||||||
10
env.example
10
env.example
|
|
@ -127,6 +127,16 @@ SUMMARY_LANGUAGE=English
|
||||||
### Entity types that the LLM will attempt to recognize
|
### Entity types that the LLM will attempt to recognize
|
||||||
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
|
# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
### Entity Resolution Configuration
|
||||||
|
### Automatically deduplicates entities (e.g., "FDA" = "US Food and Drug Administration")
|
||||||
|
### Uses 3-layer approach: case normalization → fuzzy matching → LLM verification
|
||||||
|
###########################################################
|
||||||
|
# ENTITY_RESOLUTION_ENABLED=true
|
||||||
|
# ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
|
||||||
|
# ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
|
||||||
|
# ENTITY_RESOLUTION_MAX_CANDIDATES=3
|
||||||
|
|
||||||
### Chunk size for document splitting, 500~1500 is recommended
|
### Chunk size for document splitting, 500~1500 is recommended
|
||||||
# CHUNK_SIZE=1200
|
# CHUNK_SIZE=1200
|
||||||
# CHUNK_OVERLAP_SIZE=100
|
# CHUNK_OVERLAP_SIZE=100
|
||||||
|
|
|
||||||
93
examples/test_entity_resolution.py
Normal file
93
examples/test_entity_resolution.py
Normal file
|
|
@ -0,0 +1,93 @@
|
||||||
|
"""
|
||||||
|
Quick test for Entity Resolution feature.
|
||||||
|
|
||||||
|
Tests that:
|
||||||
|
1. "FDA" and "US Food and Drug Administration" resolve to the same entity
|
||||||
|
2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from lightrag import LightRAG
|
||||||
|
from lightrag.entity_resolution import EntityResolutionConfig
|
||||||
|
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
|
||||||
|
from lightrag.utils import logger
|
||||||
|
import logging
|
||||||
|
|
||||||
|
WORKING_DIR = "./test_entity_resolution"
|
||||||
|
|
||||||
|
# Test document with entities that should be deduplicated
|
||||||
|
TEST_DOC = """
|
||||||
|
The FDA approved Dupixent for treating eczema in 2017.
|
||||||
|
The US Food and Drug Administration later expanded the drug's indications.
|
||||||
|
Dupixant (sometimes misspelled) has shown good results in clinical trials.
|
||||||
|
The FDA continues to monitor the safety of Dupixent.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
if not os.getenv("OPENAI_API_KEY"):
|
||||||
|
print("Error: Set OPENAI_API_KEY environment variable")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Clean up previous test
|
||||||
|
if os.path.exists(WORKING_DIR):
|
||||||
|
shutil.rmtree(WORKING_DIR)
|
||||||
|
os.makedirs(WORKING_DIR)
|
||||||
|
|
||||||
|
# Set up logging to see resolution messages
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Entity Resolution Test")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
rag = LightRAG(
|
||||||
|
working_dir=WORKING_DIR,
|
||||||
|
embedding_func=openai_embed,
|
||||||
|
llm_model_func=gpt_4o_mini_complete,
|
||||||
|
entity_resolution_config=EntityResolutionConfig(
|
||||||
|
enabled=True,
|
||||||
|
fuzzy_threshold=0.85,
|
||||||
|
vector_threshold=0.5,
|
||||||
|
max_candidates=3,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
await rag.initialize_storages()
|
||||||
|
|
||||||
|
print("\nInserting test document...")
|
||||||
|
print(f"Document: {TEST_DOC.strip()}")
|
||||||
|
print("\n" + "-" * 60)
|
||||||
|
|
||||||
|
await rag.ainsert(TEST_DOC)
|
||||||
|
|
||||||
|
print("\n" + "-" * 60)
|
||||||
|
print("Checking extracted entities...")
|
||||||
|
|
||||||
|
# Read the graph to see what entities were created
|
||||||
|
graph_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
|
||||||
|
if os.path.exists(graph_file):
|
||||||
|
import networkx as nx
|
||||||
|
|
||||||
|
G = nx.read_graphml(graph_file)
|
||||||
|
print(f"\nEntities in graph ({len(G.nodes())} total):")
|
||||||
|
for node in sorted(G.nodes()):
|
||||||
|
print(f" - {node}")
|
||||||
|
|
||||||
|
print(f"\nRelationships: {len(G.edges())}")
|
||||||
|
else:
|
||||||
|
print("Graph file not found")
|
||||||
|
|
||||||
|
await rag.finalize_storages()
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Test complete!")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
|
@ -450,6 +450,20 @@ def parse_args() -> argparse.Namespace:
|
||||||
"EMBEDDING_TOKEN_LIMIT", None, int, special_none=True
|
"EMBEDDING_TOKEN_LIMIT", None, int, special_none=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Entity Resolution configuration
|
||||||
|
args.entity_resolution_enabled = get_env_value(
|
||||||
|
"ENTITY_RESOLUTION_ENABLED", False, bool
|
||||||
|
)
|
||||||
|
args.entity_resolution_fuzzy_threshold = get_env_value(
|
||||||
|
"ENTITY_RESOLUTION_FUZZY_THRESHOLD", 0.85, float
|
||||||
|
)
|
||||||
|
args.entity_resolution_vector_threshold = get_env_value(
|
||||||
|
"ENTITY_RESOLUTION_VECTOR_THRESHOLD", 0.5, float
|
||||||
|
)
|
||||||
|
args.entity_resolution_max_candidates = get_env_value(
|
||||||
|
"ENTITY_RESOLUTION_MAX_CANDIDATES", 3, int
|
||||||
|
)
|
||||||
|
|
||||||
ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
|
ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name
|
||||||
ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag
|
ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,8 @@
|
||||||
LightRAG FastAPI Server
|
LightRAG FastAPI Server
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from fastapi import FastAPI, Depends, HTTPException, Request
|
from fastapi import FastAPI, Depends, HTTPException, Request
|
||||||
from fastapi.exceptions import RequestValidationError
|
from fastapi.exceptions import RequestValidationError
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
|
|
@ -642,6 +644,23 @@ def create_app(args):
|
||||||
raise Exception(f"Failed to import {binding} options: {e}")
|
raise Exception(f"Failed to import {binding} options: {e}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def create_entity_resolution_config(args) -> object | None:
|
||||||
|
"""
|
||||||
|
Create EntityResolutionConfig from command line/env arguments.
|
||||||
|
Returns None if entity resolution is disabled.
|
||||||
|
"""
|
||||||
|
if not args.entity_resolution_enabled:
|
||||||
|
return None
|
||||||
|
|
||||||
|
from lightrag.entity_resolution import EntityResolutionConfig
|
||||||
|
|
||||||
|
return EntityResolutionConfig(
|
||||||
|
enabled=True,
|
||||||
|
fuzzy_threshold=args.entity_resolution_fuzzy_threshold,
|
||||||
|
vector_threshold=args.entity_resolution_vector_threshold,
|
||||||
|
max_candidates=args.entity_resolution_max_candidates,
|
||||||
|
)
|
||||||
|
|
||||||
def create_optimized_embedding_function(
|
def create_optimized_embedding_function(
|
||||||
config_cache: LLMConfigCache, binding, model, host, api_key, args
|
config_cache: LLMConfigCache, binding, model, host, api_key, args
|
||||||
) -> EmbeddingFunc:
|
) -> EmbeddingFunc:
|
||||||
|
|
@ -1029,6 +1048,7 @@ def create_app(args):
|
||||||
"entity_types": args.entity_types,
|
"entity_types": args.entity_types,
|
||||||
},
|
},
|
||||||
ollama_server_infos=ollama_server_infos,
|
ollama_server_infos=ollama_server_infos,
|
||||||
|
entity_resolution_config=create_entity_resolution_config(args),
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to initialize LightRAG: {e}")
|
logger.error(f"Failed to initialize LightRAG: {e}")
|
||||||
|
|
|
||||||
|
|
@ -443,7 +443,7 @@ class DocStatusResponse(BaseModel):
|
||||||
metadata: Optional[dict[str, Any]] = Field(
|
metadata: Optional[dict[str, Any]] = Field(
|
||||||
default=None, description="Additional metadata about the document"
|
default=None, description="Additional metadata about the document"
|
||||||
)
|
)
|
||||||
file_path: str = Field(description="Path to the document file")
|
file_path: Optional[str] = Field(default=None, description="Path to the document file")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
json_schema_extra = {
|
json_schema_extra = {
|
||||||
|
|
|
||||||
29
lightrag/entity_resolution/__init__.py
Normal file
29
lightrag/entity_resolution/__init__.py
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
"""
|
||||||
|
Entity Resolution Module for LightRAG
|
||||||
|
|
||||||
|
Provides automatic entity deduplication using a 3-layer approach:
|
||||||
|
1. Case normalization (exact match)
|
||||||
|
2. Fuzzy string matching (typos)
|
||||||
|
3. Vector similarity + LLM verification (semantic matches)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .resolver import (
|
||||||
|
resolve_entity,
|
||||||
|
resolve_entity_with_vdb,
|
||||||
|
ResolutionResult,
|
||||||
|
get_cached_alias,
|
||||||
|
store_alias,
|
||||||
|
fuzzy_similarity,
|
||||||
|
)
|
||||||
|
from .config import EntityResolutionConfig, DEFAULT_CONFIG
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"resolve_entity",
|
||||||
|
"resolve_entity_with_vdb",
|
||||||
|
"ResolutionResult",
|
||||||
|
"EntityResolutionConfig",
|
||||||
|
"DEFAULT_CONFIG",
|
||||||
|
"get_cached_alias",
|
||||||
|
"store_alias",
|
||||||
|
"fuzzy_similarity",
|
||||||
|
]
|
||||||
57
lightrag/entity_resolution/config.py
Normal file
57
lightrag/entity_resolution/config.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
"""Configuration for Entity Resolution
|
||||||
|
|
||||||
|
Uses the same LLM that LightRAG is configured with - no separate model config needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class EntityResolutionConfig:
|
||||||
|
"""Configuration for the entity resolution system."""
|
||||||
|
|
||||||
|
# Whether entity resolution is enabled
|
||||||
|
enabled: bool = True
|
||||||
|
|
||||||
|
# Fuzzy pre-resolution: Enable/disable within-batch fuzzy matching before
|
||||||
|
# VDB lookup. When enabled, entities in the same batch are matched by string
|
||||||
|
# similarity alone. Set to False to skip fuzzy pre-resolution entirely (only
|
||||||
|
# exact case-insensitive matches will be accepted within batch; all other
|
||||||
|
# resolution goes to VDB/LLM). Disabling reduces false positives but may
|
||||||
|
# miss obvious typo corrections.
|
||||||
|
fuzzy_pre_resolution_enabled: bool = True
|
||||||
|
|
||||||
|
# Fuzzy string matching threshold (0-1)
|
||||||
|
# Above this = auto-match (catches typos like Dupixant/Dupixent at 0.88)
|
||||||
|
# Below this = continue to vector search
|
||||||
|
# Tuning advice:
|
||||||
|
# 0.90+ = Very conservative, near-identical strings (Dupixent/Dupixant)
|
||||||
|
# 0.85 = Balanced default, catches typos, avoids most false positives
|
||||||
|
# 0.80 = Aggressive, may merge distinct entities with similar names
|
||||||
|
# <0.75 = Not recommended, high false positive risk (Celebrex/Cerebyx=0.67)
|
||||||
|
# Test with your domain data; pharmaceutical names need higher thresholds.
|
||||||
|
fuzzy_threshold: float = 0.85
|
||||||
|
|
||||||
|
# Vector similarity threshold for finding candidates
|
||||||
|
# Low threshold = cast wide net, LLM will verify
|
||||||
|
# 0.5 catches FDA/US Food and Drug Administration at 0.67
|
||||||
|
vector_threshold: float = 0.5
|
||||||
|
|
||||||
|
# Maximum number of vector candidates to verify with LLM
|
||||||
|
# Limits cost - uses same LLM as LightRAG main config
|
||||||
|
max_candidates: int = 3
|
||||||
|
|
||||||
|
# LLM verification prompt template
|
||||||
|
llm_prompt_template: str = field(
|
||||||
|
default="""Are these two terms referring to the same entity?
|
||||||
|
Consider typos, misspellings, abbreviations, or alternate names.
|
||||||
|
|
||||||
|
Term A: {term_a}
|
||||||
|
Term B: {term_b}
|
||||||
|
|
||||||
|
Answer only YES or NO.""",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Default configuration
|
||||||
|
DEFAULT_CONFIG = EntityResolutionConfig()
|
||||||
335
lightrag/entity_resolution/resolver.py
Normal file
335
lightrag/entity_resolution/resolver.py
Normal file
|
|
@ -0,0 +1,335 @@
|
||||||
|
"""Entity Resolution - 3-Layer Approach
|
||||||
|
|
||||||
|
Layer 1: Case normalization (exact match)
|
||||||
|
Layer 2: Fuzzy string matching (>85% = typos)
|
||||||
|
Layer 3: Vector similarity + LLM verification (semantic matches)
|
||||||
|
|
||||||
|
Uses the same LLM that LightRAG is configured with.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Awaitable, Callable
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from lightrag.utils import logger
|
||||||
|
from .config import DEFAULT_CONFIG, EntityResolutionConfig
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ResolutionResult:
|
||||||
|
"""Result of entity resolution attempt."""
|
||||||
|
|
||||||
|
action: str # "match" | "new"
|
||||||
|
matched_entity: str | None
|
||||||
|
confidence: float
|
||||||
|
method: str # "exact" | "fuzzy" | "llm" | "none" | "disabled"
|
||||||
|
|
||||||
|
|
||||||
|
def cosine_similarity(a: list[float], b: list[float]) -> float:
|
||||||
|
"""Calculate cosine similarity between two vectors."""
|
||||||
|
a_arr, b_arr = np.array(a), np.array(b)
|
||||||
|
norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr)
|
||||||
|
if norm_a == 0 or norm_b == 0:
|
||||||
|
return 0.0
|
||||||
|
return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))
|
||||||
|
|
||||||
|
|
||||||
|
def fuzzy_similarity(a: str, b: str) -> float:
|
||||||
|
"""Calculate fuzzy string similarity (0-1)."""
|
||||||
|
return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()
|
||||||
|
|
||||||
|
|
||||||
|
def find_vector_candidates(
|
||||||
|
query_embedding: list[float],
|
||||||
|
existing_entities: list[tuple[str, list[float]]],
|
||||||
|
threshold: float,
|
||||||
|
) -> list[tuple[str, float]]:
|
||||||
|
"""Find entities with vector similarity above threshold."""
|
||||||
|
candidates = []
|
||||||
|
for name, embedding in existing_entities:
|
||||||
|
sim = cosine_similarity(query_embedding, embedding)
|
||||||
|
if sim >= threshold:
|
||||||
|
candidates.append((name, sim))
|
||||||
|
# Sort by similarity descending
|
||||||
|
candidates.sort(key=lambda x: x[1], reverse=True)
|
||||||
|
return candidates
|
||||||
|
|
||||||
|
|
||||||
|
async def llm_verify(
|
||||||
|
term_a: str,
|
||||||
|
term_b: str,
|
||||||
|
llm_fn: Callable[[str], Awaitable[str]],
|
||||||
|
prompt_template: str,
|
||||||
|
) -> bool:
|
||||||
|
"""Ask LLM if two terms refer to the same entity.
|
||||||
|
|
||||||
|
Uses strict parsing with exact token matching only. Accepted responses:
|
||||||
|
- Positive: "YES", "TRUE", "SAME", "MATCH"
|
||||||
|
- Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME"
|
||||||
|
Any other response defaults to False to avoid false positive merges.
|
||||||
|
"""
|
||||||
|
prompt = prompt_template.format(term_a=term_a, term_b=term_b)
|
||||||
|
response = await llm_fn(prompt)
|
||||||
|
|
||||||
|
# Normalize response: strip whitespace, take first line only
|
||||||
|
normalized = response.strip().split("\n")[0].strip().upper()
|
||||||
|
|
||||||
|
# Remove common trailing punctuation
|
||||||
|
normalized = normalized.rstrip(".!,")
|
||||||
|
|
||||||
|
# Only accept exact tokens (no prefix/substring matching)
|
||||||
|
if normalized in ("YES", "TRUE", "SAME", "MATCH"):
|
||||||
|
return True
|
||||||
|
if normalized in ("NO", "FALSE", "DIFFERENT", "NOT SAME"):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Default to False for ambiguous responses (safer than false positive)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
async def resolve_entity(
|
||||||
|
entity_name: str,
|
||||||
|
existing_entities: list[tuple[str, list[float]]],
|
||||||
|
embed_fn: Callable[[str], Awaitable[list[float]]],
|
||||||
|
llm_fn: Callable[[str], Awaitable[str]],
|
||||||
|
config: EntityResolutionConfig = DEFAULT_CONFIG,
|
||||||
|
) -> ResolutionResult:
|
||||||
|
"""Resolve an entity against existing entities using 3-layer approach.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entity_name: The new entity name to resolve
|
||||||
|
existing_entities: List of (name, embedding) tuples for existing entities
|
||||||
|
embed_fn: Async function to get embedding for a string (same as LightRAG uses)
|
||||||
|
llm_fn: Async function to query LLM (same as LightRAG uses)
|
||||||
|
config: Resolution configuration
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ResolutionResult with action ("match" or "new"), matched entity,
|
||||||
|
confidence, and method used.
|
||||||
|
"""
|
||||||
|
if not config.enabled:
|
||||||
|
return ResolutionResult("new", None, 0.0, "disabled")
|
||||||
|
|
||||||
|
if not existing_entities:
|
||||||
|
return ResolutionResult("new", None, 0.0, "none")
|
||||||
|
|
||||||
|
normalized = entity_name.lower().strip()
|
||||||
|
|
||||||
|
# Layer 1: Case-insensitive exact match
|
||||||
|
for name, _ in existing_entities:
|
||||||
|
if name.lower().strip() == normalized:
|
||||||
|
return ResolutionResult("match", name, 1.0, "exact")
|
||||||
|
|
||||||
|
# Layer 2: Fuzzy string matching (catches typos)
|
||||||
|
best_fuzzy_match = None
|
||||||
|
best_fuzzy_score = 0.0
|
||||||
|
|
||||||
|
for name, _ in existing_entities:
|
||||||
|
similarity = fuzzy_similarity(entity_name, name)
|
||||||
|
if similarity > best_fuzzy_score:
|
||||||
|
best_fuzzy_score = similarity
|
||||||
|
best_fuzzy_match = name
|
||||||
|
|
||||||
|
if best_fuzzy_score >= config.fuzzy_threshold:
|
||||||
|
return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
|
||||||
|
|
||||||
|
# Layer 3: Vector similarity + LLM verification
|
||||||
|
embedding = await embed_fn(entity_name)
|
||||||
|
candidates = find_vector_candidates(
|
||||||
|
embedding,
|
||||||
|
existing_entities,
|
||||||
|
config.vector_threshold,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify top candidates with LLM
|
||||||
|
for candidate_name, similarity in candidates[: config.max_candidates]:
|
||||||
|
is_same = await llm_verify(
|
||||||
|
entity_name,
|
||||||
|
candidate_name,
|
||||||
|
llm_fn,
|
||||||
|
config.llm_prompt_template,
|
||||||
|
)
|
||||||
|
if is_same:
|
||||||
|
return ResolutionResult("match", candidate_name, similarity, "llm")
|
||||||
|
|
||||||
|
# No match found - this is a new entity
|
||||||
|
return ResolutionResult("new", None, 0.0, "none")
|
||||||
|
|
||||||
|
|
||||||
|
async def resolve_entity_with_vdb(
|
||||||
|
entity_name: str,
|
||||||
|
entity_vdb, # BaseVectorStorage - imported dynamically to avoid circular imports
|
||||||
|
llm_fn: Callable[[str], Awaitable[str]],
|
||||||
|
config: EntityResolutionConfig = DEFAULT_CONFIG,
|
||||||
|
) -> ResolutionResult:
|
||||||
|
"""Resolve an entity using VDB for similarity search.
|
||||||
|
|
||||||
|
This is the production integration that uses LightRAG's vector database
|
||||||
|
directly instead of requiring pre-computed embeddings.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entity_name: The new entity name to resolve
|
||||||
|
entity_vdb: LightRAG's entity vector database (BaseVectorStorage)
|
||||||
|
llm_fn: Async function to query LLM (same as LightRAG uses)
|
||||||
|
config: Resolution configuration
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ResolutionResult with action ("match" or "new"), matched entity,
|
||||||
|
confidence, and method used.
|
||||||
|
"""
|
||||||
|
if not config.enabled:
|
||||||
|
return ResolutionResult("new", None, 0.0, "disabled")
|
||||||
|
|
||||||
|
if entity_vdb is None:
|
||||||
|
return ResolutionResult("new", None, 0.0, "none")
|
||||||
|
|
||||||
|
normalized = entity_name.lower().strip()
|
||||||
|
|
||||||
|
# Query VDB for similar entities - cast wide net, LLM will verify
|
||||||
|
# top_k is doubled to have enough candidates after filtering
|
||||||
|
try:
|
||||||
|
candidates = await entity_vdb.query(
|
||||||
|
entity_name, top_k=config.max_candidates * 3
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
# Log and skip resolution if VDB query fails
|
||||||
|
logger.debug(f"VDB query failed for '{entity_name}': {e}")
|
||||||
|
return ResolutionResult("new", None, 0.0, "none")
|
||||||
|
|
||||||
|
if not candidates:
|
||||||
|
return ResolutionResult("new", None, 0.0, "none")
|
||||||
|
|
||||||
|
# Layer 1: Case-insensitive exact match among candidates
|
||||||
|
for candidate in candidates:
|
||||||
|
candidate_name = candidate.get("entity_name")
|
||||||
|
if candidate_name and candidate_name.lower().strip() == normalized:
|
||||||
|
return ResolutionResult("match", candidate_name, 1.0, "exact")
|
||||||
|
|
||||||
|
# Layer 2: Fuzzy string matching (catches typos)
|
||||||
|
best_fuzzy_match = None
|
||||||
|
best_fuzzy_score = 0.0
|
||||||
|
|
||||||
|
for candidate in candidates:
|
||||||
|
candidate_name = candidate.get("entity_name")
|
||||||
|
if not candidate_name:
|
||||||
|
continue
|
||||||
|
similarity = fuzzy_similarity(entity_name, candidate_name)
|
||||||
|
if similarity > best_fuzzy_score:
|
||||||
|
best_fuzzy_score = similarity
|
||||||
|
best_fuzzy_match = candidate_name
|
||||||
|
|
||||||
|
if best_fuzzy_score >= config.fuzzy_threshold:
|
||||||
|
return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy")
|
||||||
|
|
||||||
|
# Layer 3: LLM verification on top candidates
|
||||||
|
verified_count = 0
|
||||||
|
for candidate in candidates:
|
||||||
|
if verified_count >= config.max_candidates:
|
||||||
|
break
|
||||||
|
candidate_name = candidate.get("entity_name")
|
||||||
|
if not candidate_name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
is_same = await llm_verify(
|
||||||
|
entity_name,
|
||||||
|
candidate_name,
|
||||||
|
llm_fn,
|
||||||
|
config.llm_prompt_template,
|
||||||
|
)
|
||||||
|
verified_count += 1
|
||||||
|
|
||||||
|
if is_same:
|
||||||
|
# Use distance from VDB if available (converted to similarity)
|
||||||
|
similarity = 0.7 # Default confidence for LLM match
|
||||||
|
return ResolutionResult("match", candidate_name, similarity, "llm")
|
||||||
|
|
||||||
|
# No match found - this is a new entity
|
||||||
|
return ResolutionResult("new", None, 0.0, "none")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Alias Cache Functions (PostgreSQL) ---
|
||||||
|
|
||||||
|
|
||||||
|
async def get_cached_alias(
|
||||||
|
alias: str,
|
||||||
|
db, # PostgresDB instance
|
||||||
|
workspace: str,
|
||||||
|
) -> tuple[str, str, float] | None:
|
||||||
|
"""Check if alias is already resolved in cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
alias: The entity name to look up
|
||||||
|
db: PostgresDB instance with query method
|
||||||
|
workspace: Workspace for isolation
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (canonical_entity, method, confidence) if found, None otherwise
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from lightrag.kg.postgres_impl import SQL_TEMPLATES
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
normalized_alias = alias.lower().strip()
|
||||||
|
|
||||||
|
sql = SQL_TEMPLATES["get_alias"]
|
||||||
|
try:
|
||||||
|
result = await db.query(sql, params=[workspace, normalized_alias])
|
||||||
|
if result:
|
||||||
|
return (
|
||||||
|
result["canonical_entity"],
|
||||||
|
result["method"],
|
||||||
|
result["confidence"],
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Alias cache lookup error: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def store_alias(
|
||||||
|
alias: str,
|
||||||
|
canonical: str,
|
||||||
|
method: str,
|
||||||
|
confidence: float,
|
||||||
|
db, # PostgresDB instance
|
||||||
|
workspace: str,
|
||||||
|
) -> None:
|
||||||
|
"""Store a resolution in the alias cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
alias: The variant name (e.g., "FDA")
|
||||||
|
canonical: The resolved canonical name (e.g., "US Food and Drug Administration")
|
||||||
|
method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual')
|
||||||
|
confidence: Resolution confidence (0-1)
|
||||||
|
db: PostgresDB instance with execute method
|
||||||
|
workspace: Workspace for isolation
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from lightrag.kg.postgres_impl import SQL_TEMPLATES
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
normalized_alias = alias.lower().strip()
|
||||||
|
|
||||||
|
# Don't store self-referential aliases (e.g., "FDA" → "FDA")
|
||||||
|
if normalized_alias == canonical.lower().strip():
|
||||||
|
return
|
||||||
|
|
||||||
|
sql = SQL_TEMPLATES["upsert_alias"]
|
||||||
|
try:
|
||||||
|
await db.execute(
|
||||||
|
sql,
|
||||||
|
data={
|
||||||
|
"workspace": workspace,
|
||||||
|
"alias": normalized_alias,
|
||||||
|
"canonical_entity": canonical,
|
||||||
|
"method": method,
|
||||||
|
"confidence": confidence,
|
||||||
|
"create_time": datetime.now(timezone.utc).replace(tzinfo=None),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Alias cache store error: {e}")
|
||||||
|
|
@ -1130,7 +1130,14 @@ class PostgreSQLDB:
|
||||||
existing_indexes = {row["indexname"] for row in existing_indexes_result}
|
existing_indexes = {row["indexname"] for row in existing_indexes_result}
|
||||||
|
|
||||||
# Create missing indexes
|
# Create missing indexes
|
||||||
|
# Tables that don't have an 'id' column (use different primary key structure)
|
||||||
|
tables_without_id = {"LIGHTRAG_ENTITY_ALIASES"}
|
||||||
|
|
||||||
for k in table_names:
|
for k in table_names:
|
||||||
|
# Skip tables that don't have an 'id' column
|
||||||
|
if k in tables_without_id:
|
||||||
|
continue
|
||||||
|
|
||||||
# Create index for id column if missing
|
# Create index for id column if missing
|
||||||
index_name = f"idx_{k.lower()}_id"
|
index_name = f"idx_{k.lower()}_id"
|
||||||
if index_name not in existing_indexes:
|
if index_name not in existing_indexes:
|
||||||
|
|
@ -1259,6 +1266,12 @@ class PostgreSQLDB:
|
||||||
f"PostgreSQL, Failed to create full entities/relations tables: {e}"
|
f"PostgreSQL, Failed to create full entities/relations tables: {e}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Migrate entity aliases table to add update_time, index on canonical_entity, and confidence constraint
|
||||||
|
try:
|
||||||
|
await self._migrate_entity_aliases_schema()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"PostgreSQL, Failed to migrate entity aliases schema: {e}")
|
||||||
|
|
||||||
async def _migrate_create_full_entities_relations_tables(self):
|
async def _migrate_create_full_entities_relations_tables(self):
|
||||||
"""Create LIGHTRAG_FULL_ENTITIES and LIGHTRAG_FULL_RELATIONS tables if they don't exist"""
|
"""Create LIGHTRAG_FULL_ENTITIES and LIGHTRAG_FULL_RELATIONS tables if they don't exist"""
|
||||||
tables_to_check = [
|
tables_to_check = [
|
||||||
|
|
@ -1323,6 +1336,127 @@ class PostgreSQLDB:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to create table {table_name}: {e}")
|
logger.error(f"Failed to create table {table_name}: {e}")
|
||||||
|
|
||||||
|
async def _migrate_entity_aliases_schema(self):
|
||||||
|
"""Migrate LIGHTRAG_ENTITY_ALIASES table to add update_time column, canonical index, and confidence constraint"""
|
||||||
|
table_name = "LIGHTRAG_ENTITY_ALIASES"
|
||||||
|
|
||||||
|
# Check if table exists first
|
||||||
|
check_table_sql = """
|
||||||
|
SELECT table_name
|
||||||
|
FROM information_schema.tables
|
||||||
|
WHERE table_name = $1
|
||||||
|
AND table_schema = 'public'
|
||||||
|
"""
|
||||||
|
table_exists = await self.query(check_table_sql, [table_name.lower()])
|
||||||
|
if not table_exists:
|
||||||
|
logger.debug(f"Table {table_name} does not exist yet, skipping migration")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 1. Add update_time column if it doesn't exist
|
||||||
|
check_column_sql = """
|
||||||
|
SELECT column_name
|
||||||
|
FROM information_schema.columns
|
||||||
|
WHERE table_name = $1
|
||||||
|
AND column_name = 'update_time'
|
||||||
|
AND table_schema = 'public'
|
||||||
|
"""
|
||||||
|
column_exists = await self.query(check_column_sql, [table_name.lower()])
|
||||||
|
|
||||||
|
if not column_exists:
|
||||||
|
try:
|
||||||
|
# Three-step migration to add update_time column:
|
||||||
|
# 1. Add column WITHOUT default - avoids full table rewrite on large tables
|
||||||
|
# 2. Backfill existing rows with create_time values
|
||||||
|
# 3. Set DEFAULT for future inserts
|
||||||
|
# Note: There's a tiny race window between steps 1-3 where concurrent
|
||||||
|
# inserts could get NULL. This is acceptable for this migration use case.
|
||||||
|
#
|
||||||
|
# Step 1: Add column WITHOUT default (existing rows get NULL)
|
||||||
|
add_column_sql = f"""
|
||||||
|
ALTER TABLE {table_name}
|
||||||
|
ADD COLUMN update_time TIMESTAMP(0)
|
||||||
|
"""
|
||||||
|
await self.execute(add_column_sql)
|
||||||
|
logger.info(f"PostgreSQL, Added update_time column to {table_name}")
|
||||||
|
|
||||||
|
# Step 2: Set existing rows' update_time to their create_time
|
||||||
|
update_sql = f"""
|
||||||
|
UPDATE {table_name}
|
||||||
|
SET update_time = create_time
|
||||||
|
WHERE update_time IS NULL
|
||||||
|
"""
|
||||||
|
await self.execute(update_sql)
|
||||||
|
logger.info(
|
||||||
|
f"PostgreSQL, Initialized update_time values in {table_name}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 3: Set default for future rows
|
||||||
|
set_default_sql = f"""
|
||||||
|
ALTER TABLE {table_name}
|
||||||
|
ALTER COLUMN update_time SET DEFAULT CURRENT_TIMESTAMP
|
||||||
|
"""
|
||||||
|
await self.execute(set_default_sql)
|
||||||
|
logger.info(
|
||||||
|
f"PostgreSQL, Set default for update_time column in {table_name}"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"PostgreSQL, Failed to add update_time column to {table_name}: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Create index on (workspace, canonical_entity) for get_aliases_for_canonical query
|
||||||
|
index_name = "idx_lightrag_entity_aliases_canonical"
|
||||||
|
check_index_sql = """
|
||||||
|
SELECT indexname
|
||||||
|
FROM pg_indexes
|
||||||
|
WHERE tablename = $1
|
||||||
|
AND indexname = $2
|
||||||
|
"""
|
||||||
|
index_exists = await self.query(
|
||||||
|
check_index_sql, [table_name.lower(), index_name]
|
||||||
|
)
|
||||||
|
|
||||||
|
if not index_exists:
|
||||||
|
try:
|
||||||
|
create_index_sql = f"""
|
||||||
|
CREATE INDEX {index_name}
|
||||||
|
ON {table_name} (workspace, canonical_entity)
|
||||||
|
"""
|
||||||
|
await self.execute(create_index_sql)
|
||||||
|
logger.info(f"PostgreSQL, Created index {index_name} on {table_name}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"PostgreSQL, Failed to create index {index_name}: {e}")
|
||||||
|
|
||||||
|
# 3. Add CHECK constraint for confidence range if it doesn't exist
|
||||||
|
constraint_name = "confidence_range"
|
||||||
|
check_constraint_sql = """
|
||||||
|
SELECT constraint_name
|
||||||
|
FROM information_schema.table_constraints
|
||||||
|
WHERE table_name = $1
|
||||||
|
AND constraint_name = $2
|
||||||
|
AND constraint_type = 'CHECK'
|
||||||
|
AND table_schema = 'public'
|
||||||
|
"""
|
||||||
|
constraint_exists = await self.query(
|
||||||
|
check_constraint_sql, [table_name.lower(), constraint_name]
|
||||||
|
)
|
||||||
|
|
||||||
|
if not constraint_exists:
|
||||||
|
try:
|
||||||
|
add_constraint_sql = f"""
|
||||||
|
ALTER TABLE {table_name}
|
||||||
|
ADD CONSTRAINT {constraint_name}
|
||||||
|
CHECK (confidence >= 0 AND confidence <= 1)
|
||||||
|
"""
|
||||||
|
await self.execute(add_constraint_sql)
|
||||||
|
logger.info(
|
||||||
|
f"PostgreSQL, Added CHECK constraint {constraint_name} to {table_name}"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"PostgreSQL, Failed to add CHECK constraint {constraint_name} to {table_name}: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
async def _create_pagination_indexes(self):
|
async def _create_pagination_indexes(self):
|
||||||
"""Create indexes to optimize pagination queries for LIGHTRAG_DOC_STATUS"""
|
"""Create indexes to optimize pagination queries for LIGHTRAG_DOC_STATUS"""
|
||||||
indexes = [
|
indexes = [
|
||||||
|
|
@ -1402,7 +1536,7 @@ class PostgreSQLDB:
|
||||||
"VCHORDRQ": f"""
|
"VCHORDRQ": f"""
|
||||||
CREATE INDEX {{vector_index_name}}
|
CREATE INDEX {{vector_index_name}}
|
||||||
ON {{k}} USING vchordrq (content_vector vector_cosine_ops)
|
ON {{k}} USING vchordrq (content_vector vector_cosine_ops)
|
||||||
{f'WITH (options = $${self.vchordrq_build_options}$$)' if self.vchordrq_build_options else ''}
|
{f"WITH (options = $${self.vchordrq_build_options}$$)" if self.vchordrq_build_options else ""}
|
||||||
""",
|
""",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -4906,6 +5040,19 @@ TABLES = {
|
||||||
CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id)
|
CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id)
|
||||||
)"""
|
)"""
|
||||||
},
|
},
|
||||||
|
"LIGHTRAG_ENTITY_ALIASES": {
|
||||||
|
"ddl": """CREATE TABLE LIGHTRAG_ENTITY_ALIASES (
|
||||||
|
workspace VARCHAR(255),
|
||||||
|
alias VARCHAR(512),
|
||||||
|
canonical_entity VARCHAR(512),
|
||||||
|
method VARCHAR(50),
|
||||||
|
confidence FLOAT,
|
||||||
|
create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
CONSTRAINT LIGHTRAG_ENTITY_ALIASES_PK PRIMARY KEY (workspace, alias),
|
||||||
|
CONSTRAINT confidence_range CHECK (confidence >= 0 AND confidence <= 1)
|
||||||
|
)"""
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -5117,4 +5264,25 @@ SQL_TEMPLATES = {
|
||||||
"drop_specifiy_table_workspace": """
|
"drop_specifiy_table_workspace": """
|
||||||
DELETE FROM {table_name} WHERE workspace=$1
|
DELETE FROM {table_name} WHERE workspace=$1
|
||||||
""",
|
""",
|
||||||
|
# Entity alias cache
|
||||||
|
"get_alias": """
|
||||||
|
SELECT canonical_entity, method, confidence
|
||||||
|
FROM LIGHTRAG_ENTITY_ALIASES
|
||||||
|
WHERE workspace=$1 AND alias=$2
|
||||||
|
""",
|
||||||
|
"upsert_alias": """
|
||||||
|
INSERT INTO LIGHTRAG_ENTITY_ALIASES
|
||||||
|
(workspace, alias, canonical_entity, method, confidence, create_time, update_time)
|
||||||
|
VALUES ($1, $2, $3, $4, $5, $6, $6)
|
||||||
|
ON CONFLICT (workspace, alias) DO UPDATE SET
|
||||||
|
canonical_entity = EXCLUDED.canonical_entity,
|
||||||
|
method = EXCLUDED.method,
|
||||||
|
confidence = EXCLUDED.confidence,
|
||||||
|
update_time = CURRENT_TIMESTAMP
|
||||||
|
""",
|
||||||
|
"get_aliases_for_canonical": """
|
||||||
|
SELECT alias, method, confidence
|
||||||
|
FROM LIGHTRAG_ENTITY_ALIASES
|
||||||
|
WHERE workspace=$1 AND canonical_entity=$2
|
||||||
|
""",
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,7 @@ from typing import (
|
||||||
)
|
)
|
||||||
from lightrag.prompt import PROMPTS
|
from lightrag.prompt import PROMPTS
|
||||||
from lightrag.exceptions import PipelineCancelledException
|
from lightrag.exceptions import PipelineCancelledException
|
||||||
|
from lightrag.entity_resolution import EntityResolutionConfig
|
||||||
from lightrag.constants import (
|
from lightrag.constants import (
|
||||||
DEFAULT_MAX_GLEANING,
|
DEFAULT_MAX_GLEANING,
|
||||||
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
|
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
|
||||||
|
|
@ -217,6 +218,11 @@ class LightRAG:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
entity_resolution_config: EntityResolutionConfig | None = field(default=None)
|
||||||
|
"""Configuration for entity resolution (deduplication).
|
||||||
|
Set to EntityResolutionConfig() to enable, or None to disable.
|
||||||
|
Resolves entities like 'FDA' → 'US Food and Drug Administration'."""
|
||||||
|
|
||||||
# Text chunking
|
# Text chunking
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ from pathlib import Path
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import json_repair
|
import json_repair
|
||||||
|
import re
|
||||||
from typing import Any, AsyncIterator, overload, Literal
|
from typing import Any, AsyncIterator, overload, Literal
|
||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
|
|
||||||
|
|
@ -50,6 +51,13 @@ from lightrag.base import (
|
||||||
QueryContextResult,
|
QueryContextResult,
|
||||||
)
|
)
|
||||||
from lightrag.prompt import PROMPTS
|
from lightrag.prompt import PROMPTS
|
||||||
|
from lightrag.entity_resolution import (
|
||||||
|
resolve_entity_with_vdb,
|
||||||
|
get_cached_alias,
|
||||||
|
store_alias,
|
||||||
|
fuzzy_similarity,
|
||||||
|
EntityResolutionConfig,
|
||||||
|
)
|
||||||
from lightrag.constants import (
|
from lightrag.constants import (
|
||||||
GRAPH_FIELD_SEP,
|
GRAPH_FIELD_SEP,
|
||||||
DEFAULT_MAX_ENTITY_TOKENS,
|
DEFAULT_MAX_ENTITY_TOKENS,
|
||||||
|
|
@ -1590,6 +1598,180 @@ async def _rebuild_single_relationship(
|
||||||
pipeline_status["history_messages"].append(status_message)
|
pipeline_status["history_messages"].append(status_message)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_different_numeric_suffix(name_a: str, name_b: str) -> bool:
|
||||||
|
"""Check if two names have different numeric components.
|
||||||
|
|
||||||
|
This prevents false fuzzy matches between entities that differ only by number,
|
||||||
|
such as "Interleukin-4" vs "Interleukin-13" (88.9% similar but semantically distinct).
|
||||||
|
|
||||||
|
Scientific/medical entities often use numbers as key identifiers:
|
||||||
|
- Interleukins: IL-4, IL-13, IL-17
|
||||||
|
- Drug phases: Phase 1, Phase 2, Phase 3
|
||||||
|
- Receptor types: Type 1, Type 2
|
||||||
|
- Versions: v1.0, v2.0
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name_a: First entity name
|
||||||
|
name_b: Second entity name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if both names contain numbers but the numbers differ, False otherwise.
|
||||||
|
"""
|
||||||
|
# Extract all numeric patterns (integers and decimals)
|
||||||
|
pattern = r"(\d+(?:\.\d+)?)"
|
||||||
|
nums_a = re.findall(pattern, name_a)
|
||||||
|
nums_b = re.findall(pattern, name_b)
|
||||||
|
|
||||||
|
# If both have numbers and they differ, these are likely distinct entities
|
||||||
|
if nums_a and nums_b and nums_a != nums_b:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
async def _build_pre_resolution_map(
|
||||||
|
entity_names: list[str],
|
||||||
|
entity_types: dict[str, str],
|
||||||
|
entity_vdb,
|
||||||
|
llm_fn,
|
||||||
|
config: EntityResolutionConfig,
|
||||||
|
) -> tuple[dict[str, str], dict[str, float]]:
|
||||||
|
"""Build resolution map before parallel processing to prevent race conditions.
|
||||||
|
|
||||||
|
This function resolves entities against each other within the batch (using
|
||||||
|
instant fuzzy matching) and against existing VDB entries. The resulting map
|
||||||
|
is applied during parallel entity processing.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entity_names: List of entity names to resolve
|
||||||
|
entity_types: Dict mapping entity names to their types (e.g., "person", "organization").
|
||||||
|
Used to prevent fuzzy matching between entities of different types.
|
||||||
|
entity_vdb: Entity vector database for checking existing entities
|
||||||
|
llm_fn: LLM function for semantic verification
|
||||||
|
config: Entity resolution configuration
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of:
|
||||||
|
- resolution_map: Dict mapping original entity names to their resolved canonical names.
|
||||||
|
Only entities that need remapping are included.
|
||||||
|
- confidence_map: Dict mapping alias to confidence score (1.0 for exact, actual
|
||||||
|
similarity for fuzzy, result.confidence for VDB matches).
|
||||||
|
"""
|
||||||
|
resolution_map: dict[str, str] = {}
|
||||||
|
confidence_map: dict[str, float] = {}
|
||||||
|
# Track canonical entities with their types: [(name, type), ...]
|
||||||
|
canonical_entities: list[tuple[str, str]] = []
|
||||||
|
|
||||||
|
for entity_name in entity_names:
|
||||||
|
normalized = entity_name.lower().strip()
|
||||||
|
entity_type = entity_types.get(entity_name, "")
|
||||||
|
|
||||||
|
# Skip if already resolved to something in this batch
|
||||||
|
if entity_name in resolution_map:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Layer 1: Case-insensitive exact match within batch
|
||||||
|
matched = False
|
||||||
|
for canonical, canonical_type in canonical_entities:
|
||||||
|
if canonical.lower().strip() == normalized:
|
||||||
|
resolution_map[entity_name] = canonical
|
||||||
|
confidence_map[entity_name] = 1.0 # Exact match = perfect confidence
|
||||||
|
logger.debug(
|
||||||
|
f"Pre-resolution (case match): '{entity_name}' → '{canonical}'"
|
||||||
|
)
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if matched:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Layer 2: Fuzzy match within batch (catches typos like Dupixant→Dupixent)
|
||||||
|
# Only enabled when config.fuzzy_pre_resolution_enabled is True.
|
||||||
|
# Requires: similarity >= threshold AND matching types (or unknown).
|
||||||
|
if config.fuzzy_pre_resolution_enabled:
|
||||||
|
for canonical, canonical_type in canonical_entities:
|
||||||
|
similarity = fuzzy_similarity(entity_name, canonical)
|
||||||
|
if similarity >= config.fuzzy_threshold:
|
||||||
|
# Type compatibility check: skip if types differ and both known.
|
||||||
|
# Empty/unknown types are treated as compatible to avoid
|
||||||
|
# blocking legitimate matches when type info is incomplete.
|
||||||
|
types_compatible = (
|
||||||
|
not entity_type
|
||||||
|
or not canonical_type
|
||||||
|
or entity_type == canonical_type
|
||||||
|
)
|
||||||
|
if not types_compatible:
|
||||||
|
logger.debug(
|
||||||
|
f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
|
||||||
|
f"'{entity_name}' ({entity_type}) → "
|
||||||
|
f"'{canonical}' ({canonical_type}) - type mismatch"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Numeric suffix check: skip if names have different numbers
|
||||||
|
# This prevents false matches like "Interleukin-4" → "Interleukin-13"
|
||||||
|
# where fuzzy similarity is high (88.9%) but entities are distinct
|
||||||
|
if _has_different_numeric_suffix(entity_name, canonical):
|
||||||
|
logger.debug(
|
||||||
|
f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED "
|
||||||
|
f"'{entity_name}' → '{canonical}' - different numeric suffix"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Accept the fuzzy match - emit warning for review
|
||||||
|
resolution_map[entity_name] = canonical
|
||||||
|
confidence_map[entity_name] = (
|
||||||
|
similarity # Use actual similarity score
|
||||||
|
)
|
||||||
|
etype_display = entity_type or "unknown"
|
||||||
|
ctype_display = canonical_type or "unknown"
|
||||||
|
logger.warning(
|
||||||
|
f"Fuzzy pre-resolution accepted: '{entity_name}' → "
|
||||||
|
f"'{canonical}' (similarity={similarity:.3f}, "
|
||||||
|
f"types: {etype_display}→{ctype_display}). "
|
||||||
|
f"Review for correctness; adjust fuzzy_threshold or "
|
||||||
|
f"disable fuzzy_pre_resolution_enabled if needed."
|
||||||
|
)
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if matched:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Layer 3: Check existing VDB for cross-document deduplication
|
||||||
|
if entity_vdb and llm_fn:
|
||||||
|
try:
|
||||||
|
result = await resolve_entity_with_vdb(
|
||||||
|
entity_name, entity_vdb, llm_fn, config
|
||||||
|
)
|
||||||
|
if result.action == "match" and result.matched_entity:
|
||||||
|
resolution_map[entity_name] = result.matched_entity
|
||||||
|
confidence_map[entity_name] = (
|
||||||
|
result.confidence
|
||||||
|
) # Use VDB result confidence
|
||||||
|
# Add canonical from VDB so batch entities can match it.
|
||||||
|
# VDB matches don't have type info available, use empty.
|
||||||
|
canonical_entities.append((result.matched_entity, ""))
|
||||||
|
logger.debug(
|
||||||
|
f"Pre-resolution (VDB {result.method}): "
|
||||||
|
f"'{entity_name}' → '{result.matched_entity}'"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(
|
||||||
|
f"Pre-resolution VDB check failed for '{entity_name}': {e}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# No match found - this is a new canonical entity
|
||||||
|
canonical_entities.append((entity_name, entity_type))
|
||||||
|
|
||||||
|
if resolution_map:
|
||||||
|
logger.info(
|
||||||
|
f"Pre-resolution: {len(resolution_map)} entities mapped to canonical forms"
|
||||||
|
)
|
||||||
|
|
||||||
|
return resolution_map, confidence_map
|
||||||
|
|
||||||
|
|
||||||
async def _merge_nodes_then_upsert(
|
async def _merge_nodes_then_upsert(
|
||||||
entity_name: str,
|
entity_name: str,
|
||||||
nodes_data: list[dict],
|
nodes_data: list[dict],
|
||||||
|
|
@ -1600,8 +1782,128 @@ async def _merge_nodes_then_upsert(
|
||||||
pipeline_status_lock=None,
|
pipeline_status_lock=None,
|
||||||
llm_response_cache: BaseKVStorage | None = None,
|
llm_response_cache: BaseKVStorage | None = None,
|
||||||
entity_chunks_storage: BaseKVStorage | None = None,
|
entity_chunks_storage: BaseKVStorage | None = None,
|
||||||
):
|
pre_resolution_map: dict[str, str] | None = None,
|
||||||
"""Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert."""
|
) -> tuple[dict, str | None]:
|
||||||
|
"""Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (node_data, original_entity_name). original_entity_name is set if
|
||||||
|
entity resolution changed the name (e.g., "Dupixant" → "Dupixent"),
|
||||||
|
otherwise None.
|
||||||
|
"""
|
||||||
|
original_entity_name = entity_name # Track original before resolution
|
||||||
|
|
||||||
|
# Apply pre-resolution map immediately (prevents race conditions in parallel processing)
|
||||||
|
pre_resolved = False
|
||||||
|
if pre_resolution_map and entity_name in pre_resolution_map:
|
||||||
|
entity_name = pre_resolution_map[entity_name]
|
||||||
|
pre_resolved = True
|
||||||
|
logger.debug(
|
||||||
|
f"Applied pre-resolution: '{original_entity_name}' → '{entity_name}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Entity Resolution: Resolve new entity against existing entities
|
||||||
|
# Skip if already pre-resolved (to avoid redundant VDB queries)
|
||||||
|
entity_resolution_config_raw = global_config.get("entity_resolution_config")
|
||||||
|
entity_resolution_config = None
|
||||||
|
if entity_resolution_config_raw:
|
||||||
|
# Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
|
||||||
|
if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
|
||||||
|
entity_resolution_config = entity_resolution_config_raw
|
||||||
|
elif isinstance(entity_resolution_config_raw, dict):
|
||||||
|
try:
|
||||||
|
entity_resolution_config = EntityResolutionConfig(
|
||||||
|
**entity_resolution_config_raw
|
||||||
|
)
|
||||||
|
except TypeError as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Invalid entity_resolution_config: {e}. "
|
||||||
|
f"Config: {entity_resolution_config_raw}. Skipping resolution."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Safely check if entity resolution is enabled, handling both object and dict forms
|
||||||
|
def _is_resolution_enabled(config) -> bool:
|
||||||
|
if config is None:
|
||||||
|
return False
|
||||||
|
if isinstance(config, dict):
|
||||||
|
return config.get("enabled", False)
|
||||||
|
return getattr(config, "enabled", False)
|
||||||
|
|
||||||
|
# Skip VDB resolution if entity was already pre-resolved (prevents redundant queries)
|
||||||
|
if (
|
||||||
|
_is_resolution_enabled(entity_resolution_config)
|
||||||
|
and entity_vdb
|
||||||
|
and not pre_resolved
|
||||||
|
):
|
||||||
|
original_name = entity_name
|
||||||
|
workspace = global_config.get("workspace", "")
|
||||||
|
# Try knowledge_graph_inst.db first (more reliable), fallback to entity_vdb.db
|
||||||
|
db = getattr(knowledge_graph_inst, "db", None) or getattr(
|
||||||
|
entity_vdb, "db", None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Layer 0: Check alias cache first (PostgreSQL-only - requires db connection)
|
||||||
|
# Note: Alias caching is only available when using PostgreSQL storage backend
|
||||||
|
if db is not None:
|
||||||
|
try:
|
||||||
|
cached = await get_cached_alias(original_name, db, workspace)
|
||||||
|
if cached:
|
||||||
|
canonical, method, _ = cached
|
||||||
|
logger.debug(
|
||||||
|
f"Alias cache hit: '{original_name}' → '{canonical}' "
|
||||||
|
f"(method: {method})"
|
||||||
|
)
|
||||||
|
entity_name = canonical
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Entity resolution cache lookup failed for '{original_name}' "
|
||||||
|
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
|
||||||
|
"Continuing without cache."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Layers 1-3: Full VDB resolution (if not found in cache)
|
||||||
|
if entity_name == original_name:
|
||||||
|
llm_fn = global_config.get("llm_model_func")
|
||||||
|
if llm_fn:
|
||||||
|
try:
|
||||||
|
resolution = await resolve_entity_with_vdb(
|
||||||
|
entity_name,
|
||||||
|
entity_vdb,
|
||||||
|
llm_fn,
|
||||||
|
entity_resolution_config,
|
||||||
|
)
|
||||||
|
if resolution.action == "match" and resolution.matched_entity:
|
||||||
|
logger.info(
|
||||||
|
f"Entity resolution: '{entity_name}' → '{resolution.matched_entity}' "
|
||||||
|
f"(method: {resolution.method}, confidence: {resolution.confidence:.2f})"
|
||||||
|
)
|
||||||
|
entity_name = resolution.matched_entity
|
||||||
|
|
||||||
|
# Store in alias cache for next time (PostgreSQL-only)
|
||||||
|
# Note: Alias caching requires PostgreSQL storage backend
|
||||||
|
if db is not None:
|
||||||
|
try:
|
||||||
|
await store_alias(
|
||||||
|
original_name,
|
||||||
|
entity_name,
|
||||||
|
resolution.method,
|
||||||
|
resolution.confidence,
|
||||||
|
db,
|
||||||
|
workspace,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Failed to store entity alias '{original_name}' → '{entity_name}' "
|
||||||
|
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
|
||||||
|
"Resolution succeeded but cache not updated."
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Entity resolution failed for '{original_name}' "
|
||||||
|
f"(workspace: {workspace}): {type(e).__name__}: {e}. "
|
||||||
|
"Continuing with original entity name."
|
||||||
|
)
|
||||||
|
|
||||||
already_entity_types = []
|
already_entity_types = []
|
||||||
already_source_ids = []
|
already_source_ids = []
|
||||||
already_description = []
|
already_description = []
|
||||||
|
|
@ -1865,7 +2167,12 @@ async def _merge_nodes_then_upsert(
|
||||||
max_retries=3,
|
max_retries=3,
|
||||||
retry_delay=0.1,
|
retry_delay=0.1,
|
||||||
)
|
)
|
||||||
return node_data
|
|
||||||
|
# Return original name if resolution changed it, None otherwise
|
||||||
|
resolved_from = (
|
||||||
|
original_entity_name if entity_name != original_entity_name else None
|
||||||
|
)
|
||||||
|
return node_data, resolved_from
|
||||||
|
|
||||||
|
|
||||||
async def _merge_edges_then_upsert(
|
async def _merge_edges_then_upsert(
|
||||||
|
|
@ -1882,7 +2189,13 @@ async def _merge_edges_then_upsert(
|
||||||
added_entities: list = None, # New parameter to track entities added during edge processing
|
added_entities: list = None, # New parameter to track entities added during edge processing
|
||||||
relation_chunks_storage: BaseKVStorage | None = None,
|
relation_chunks_storage: BaseKVStorage | None = None,
|
||||||
entity_chunks_storage: BaseKVStorage | None = None,
|
entity_chunks_storage: BaseKVStorage | None = None,
|
||||||
|
entity_resolution_map: dict[str, str] | None = None, # Map original→resolved names
|
||||||
):
|
):
|
||||||
|
# Apply entity resolution mapping to edge endpoints
|
||||||
|
if entity_resolution_map:
|
||||||
|
src_id = entity_resolution_map.get(src_id, src_id)
|
||||||
|
tgt_id = entity_resolution_map.get(tgt_id, tgt_id)
|
||||||
|
|
||||||
if src_id == tgt_id:
|
if src_id == tgt_id:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -2472,6 +2785,76 @@ async def merge_nodes_and_edges(
|
||||||
graph_max_async = global_config.get("llm_model_max_async", 4) * 2
|
graph_max_async = global_config.get("llm_model_max_async", 4) * 2
|
||||||
semaphore = asyncio.Semaphore(graph_max_async)
|
semaphore = asyncio.Semaphore(graph_max_async)
|
||||||
|
|
||||||
|
# ===== Pre-Resolution Phase: Build entity resolution map =====
|
||||||
|
# This prevents race conditions when parallel workers process similar entities
|
||||||
|
# IMPORTANT: Include BOTH entity names AND relation endpoints to catch all duplicates
|
||||||
|
pre_resolution_map: dict[str, str] = {}
|
||||||
|
entity_resolution_config_raw = global_config.get("entity_resolution_config")
|
||||||
|
if entity_resolution_config_raw:
|
||||||
|
# Handle both dict (from asdict() serialization) and EntityResolutionConfig instances
|
||||||
|
config = None
|
||||||
|
if isinstance(entity_resolution_config_raw, EntityResolutionConfig):
|
||||||
|
config = entity_resolution_config_raw
|
||||||
|
elif isinstance(entity_resolution_config_raw, dict):
|
||||||
|
try:
|
||||||
|
config = EntityResolutionConfig(**entity_resolution_config_raw)
|
||||||
|
except TypeError as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Invalid entity_resolution_config: {e}. "
|
||||||
|
f"Config: {entity_resolution_config_raw}. Skipping resolution."
|
||||||
|
)
|
||||||
|
if config and config.enabled:
|
||||||
|
llm_fn = global_config.get("llm_model_func")
|
||||||
|
# Build entity_types map for type-aware fuzzy matching.
|
||||||
|
# Use first non-empty type for entities with multiple occurrences.
|
||||||
|
entity_types: dict[str, str] = {}
|
||||||
|
for entity_name, entities in all_nodes.items():
|
||||||
|
for entity_data in entities:
|
||||||
|
etype = entity_data.get("entity_type", "")
|
||||||
|
if etype:
|
||||||
|
entity_types[entity_name] = etype
|
||||||
|
break
|
||||||
|
|
||||||
|
# Collect ALL entity names: from entities AND from relation endpoints
|
||||||
|
# This ensures relation endpoints like "EU Medicines Agency" get resolved
|
||||||
|
# against existing entities like "European Medicines Agency"
|
||||||
|
all_entity_names = set(all_nodes.keys())
|
||||||
|
for src_id, tgt_id in all_edges.keys():
|
||||||
|
all_entity_names.add(src_id)
|
||||||
|
all_entity_names.add(tgt_id)
|
||||||
|
|
||||||
|
pre_resolution_map, confidence_map = await _build_pre_resolution_map(
|
||||||
|
list(all_entity_names),
|
||||||
|
entity_types,
|
||||||
|
entity_vdb,
|
||||||
|
llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Cache pre-resolution aliases for future lookups (PostgreSQL-only)
|
||||||
|
# This ensures aliases discovered during batch processing are available
|
||||||
|
# for subsequent document ingestion without re-running resolution
|
||||||
|
db = getattr(knowledge_graph_inst, "db", None)
|
||||||
|
if db is not None and pre_resolution_map:
|
||||||
|
workspace = global_config.get("workspace", "")
|
||||||
|
for alias, canonical in pre_resolution_map.items():
|
||||||
|
# Don't cache self-references (entity → itself)
|
||||||
|
if alias.lower().strip() != canonical.lower().strip():
|
||||||
|
try:
|
||||||
|
await store_alias(
|
||||||
|
alias=alias,
|
||||||
|
canonical=canonical,
|
||||||
|
method="pre_resolution",
|
||||||
|
confidence=confidence_map.get(alias, 1.0),
|
||||||
|
db=db,
|
||||||
|
workspace=workspace,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(
|
||||||
|
f"Failed to cache pre-resolution alias "
|
||||||
|
f"'{alias}' → '{canonical}': {e}"
|
||||||
|
)
|
||||||
|
|
||||||
# ===== Phase 1: Process all entities concurrently =====
|
# ===== Phase 1: Process all entities concurrently =====
|
||||||
log_message = f"Phase 1: Processing {total_entities_count} entities from {doc_id} (async: {graph_max_async})"
|
log_message = f"Phase 1: Processing {total_entities_count} entities from {doc_id} (async: {graph_max_async})"
|
||||||
logger.info(log_message)
|
logger.info(log_message)
|
||||||
|
|
@ -2479,6 +2862,11 @@ async def merge_nodes_and_edges(
|
||||||
pipeline_status["latest_message"] = log_message
|
pipeline_status["latest_message"] = log_message
|
||||||
pipeline_status["history_messages"].append(log_message)
|
pipeline_status["history_messages"].append(log_message)
|
||||||
|
|
||||||
|
# Resolution map to track original→resolved entity names (e.g., "Dupixant"→"Dupixent")
|
||||||
|
# This will be used to remap edge endpoints in Phase 2
|
||||||
|
entity_resolution_map: dict[str, str] = {}
|
||||||
|
resolution_map_lock = asyncio.Lock()
|
||||||
|
|
||||||
async def _locked_process_entity_name(entity_name, entities):
|
async def _locked_process_entity_name(entity_name, entities):
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
# Check for cancellation before processing entity
|
# Check for cancellation before processing entity
|
||||||
|
|
@ -2496,7 +2884,7 @@ async def merge_nodes_and_edges(
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
logger.debug(f"Processing entity {entity_name}")
|
logger.debug(f"Processing entity {entity_name}")
|
||||||
entity_data = await _merge_nodes_then_upsert(
|
entity_data, resolved_from = await _merge_nodes_then_upsert(
|
||||||
entity_name,
|
entity_name,
|
||||||
entities,
|
entities,
|
||||||
knowledge_graph_inst,
|
knowledge_graph_inst,
|
||||||
|
|
@ -2506,8 +2894,15 @@ async def merge_nodes_and_edges(
|
||||||
pipeline_status_lock,
|
pipeline_status_lock,
|
||||||
llm_response_cache,
|
llm_response_cache,
|
||||||
entity_chunks_storage,
|
entity_chunks_storage,
|
||||||
|
pre_resolution_map,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Track resolution mapping for edge remapping in Phase 2
|
||||||
|
if resolved_from is not None:
|
||||||
|
resolved_to = entity_data.get("entity_name", entity_name)
|
||||||
|
async with resolution_map_lock:
|
||||||
|
entity_resolution_map[resolved_from] = resolved_to
|
||||||
|
|
||||||
return entity_data
|
return entity_data
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -2617,6 +3012,7 @@ async def merge_nodes_and_edges(
|
||||||
added_entities, # Pass list to collect added entities
|
added_entities, # Pass list to collect added entities
|
||||||
relation_chunks_storage,
|
relation_chunks_storage,
|
||||||
entity_chunks_storage, # Add entity_chunks_storage parameter
|
entity_chunks_storage, # Add entity_chunks_storage parameter
|
||||||
|
entity_resolution_map, # Apply entity resolution to edge endpoints
|
||||||
)
|
)
|
||||||
|
|
||||||
if edge_data is None:
|
if edge_data is None:
|
||||||
|
|
@ -2649,9 +3045,36 @@ async def merge_nodes_and_edges(
|
||||||
raise prefixed_exception from e
|
raise prefixed_exception from e
|
||||||
|
|
||||||
# Create relationship processing tasks
|
# Create relationship processing tasks
|
||||||
edge_tasks = []
|
# Apply pre_resolution_map to edge endpoints to prevent duplicates from relation extraction
|
||||||
|
# Key fixes: sort for lock ordering, filter self-loops, deduplicate merged edges
|
||||||
|
resolved_edges: dict[tuple[str, str], list] = {}
|
||||||
for edge_key, edges in all_edges.items():
|
for edge_key, edges in all_edges.items():
|
||||||
task = asyncio.create_task(_locked_process_edges(edge_key, edges))
|
# Remap edge endpoints using pre-resolution map
|
||||||
|
# This catches cases like "EU Medicines Agency" → "European Medicines Agency"
|
||||||
|
resolved_src = pre_resolution_map.get(edge_key[0], edge_key[0])
|
||||||
|
resolved_tgt = pre_resolution_map.get(edge_key[1], edge_key[1])
|
||||||
|
|
||||||
|
# Skip self-loops created by resolution (e.g., both endpoints resolve to same entity)
|
||||||
|
if resolved_src == resolved_tgt:
|
||||||
|
logger.debug(
|
||||||
|
f"Skipping self-loop after resolution: {edge_key} → ({resolved_src}, {resolved_tgt})"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Sort for consistent lock ordering (prevents deadlocks)
|
||||||
|
resolved_edge_key = tuple(sorted([resolved_src, resolved_tgt]))
|
||||||
|
|
||||||
|
# Merge edges that resolve to same key (deduplication)
|
||||||
|
if resolved_edge_key not in resolved_edges:
|
||||||
|
resolved_edges[resolved_edge_key] = []
|
||||||
|
resolved_edges[resolved_edge_key].extend(edges)
|
||||||
|
|
||||||
|
# Create tasks from deduplicated edges
|
||||||
|
edge_tasks = []
|
||||||
|
for resolved_edge_key, merged_edges in resolved_edges.items():
|
||||||
|
task = asyncio.create_task(
|
||||||
|
_locked_process_edges(resolved_edge_key, merged_edges)
|
||||||
|
)
|
||||||
edge_tasks.append(task)
|
edge_tasks.append(task)
|
||||||
|
|
||||||
# Execute relationship tasks with error handling
|
# Execute relationship tasks with error handling
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@ import { useState, useCallback, useEffect, useRef } from 'react'
|
||||||
import ThemeProvider from '@/components/ThemeProvider'
|
import ThemeProvider from '@/components/ThemeProvider'
|
||||||
import TabVisibilityProvider from '@/contexts/TabVisibilityProvider'
|
import TabVisibilityProvider from '@/contexts/TabVisibilityProvider'
|
||||||
import ApiKeyAlert from '@/components/ApiKeyAlert'
|
import ApiKeyAlert from '@/components/ApiKeyAlert'
|
||||||
import StatusIndicator from '@/components/status/StatusIndicator'
|
|
||||||
import { SiteInfo, webuiPrefix } from '@/lib/constants'
|
import { SiteInfo, webuiPrefix } from '@/lib/constants'
|
||||||
import { useBackendState, useAuthStore } from '@/stores/state'
|
import { useBackendState, useAuthStore } from '@/stores/state'
|
||||||
import { useSettingsStore } from '@/stores/settings'
|
import { useSettingsStore } from '@/stores/settings'
|
||||||
|
|
@ -218,7 +217,6 @@ function App() {
|
||||||
</TabsContent>
|
</TabsContent>
|
||||||
</div>
|
</div>
|
||||||
</Tabs>
|
</Tabs>
|
||||||
{enableHealthCheck && <StatusIndicator />}
|
|
||||||
<ApiKeyAlert open={apiKeyAlertOpen} onOpenChange={handleApiKeyAlertOpenChange} />
|
<ApiKeyAlert open={apiKeyAlertOpen} onOpenChange={handleApiKeyAlertOpenChange} />
|
||||||
</main>
|
</main>
|
||||||
)}
|
)}
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,7 @@
|
||||||
import { useState, useCallback } from 'react'
|
import { useCallback } from 'react'
|
||||||
import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/Popover'
|
|
||||||
import Button from '@/components/ui/Button'
|
import Button from '@/components/ui/Button'
|
||||||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/Select'
|
|
||||||
import { useSettingsStore } from '@/stores/settings'
|
import { useSettingsStore } from '@/stores/settings'
|
||||||
import { PaletteIcon } from 'lucide-react'
|
import { SunIcon, MoonIcon } from 'lucide-react'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
import { cn } from '@/lib/utils'
|
import { cn } from '@/lib/utils'
|
||||||
|
|
||||||
|
|
@ -12,63 +10,39 @@ interface AppSettingsProps {
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function AppSettings({ className }: AppSettingsProps) {
|
export default function AppSettings({ className }: AppSettingsProps) {
|
||||||
const [opened, setOpened] = useState<boolean>(false)
|
|
||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
|
|
||||||
const language = useSettingsStore.use.language()
|
|
||||||
const setLanguage = useSettingsStore.use.setLanguage()
|
|
||||||
|
|
||||||
const theme = useSettingsStore.use.theme()
|
const theme = useSettingsStore.use.theme()
|
||||||
const setTheme = useSettingsStore.use.setTheme()
|
const setTheme = useSettingsStore.use.setTheme()
|
||||||
|
|
||||||
const handleLanguageChange = useCallback((value: string) => {
|
// Compute effective theme for icon/tooltip display when theme is 'system'
|
||||||
setLanguage(value as 'en' | 'zh' | 'fr' | 'ar' | 'zh_TW')
|
const effectiveTheme = theme === 'system'
|
||||||
}, [setLanguage])
|
? (window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light')
|
||||||
|
: theme
|
||||||
|
|
||||||
const handleThemeChange = useCallback((value: string) => {
|
const handleThemeToggle = useCallback(() => {
|
||||||
setTheme(value as 'light' | 'dark' | 'system')
|
if (theme === 'system') {
|
||||||
}, [setTheme])
|
// Detect actual system preference and toggle to opposite
|
||||||
|
const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches
|
||||||
|
setTheme(isDark ? 'light' : 'dark')
|
||||||
|
} else {
|
||||||
|
setTheme(theme === 'dark' ? 'light' : 'dark')
|
||||||
|
}
|
||||||
|
}, [theme, setTheme])
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<Popover open={opened} onOpenChange={setOpened}>
|
<Button
|
||||||
<PopoverTrigger asChild>
|
variant="ghost"
|
||||||
<Button variant="ghost" size="icon" className={cn('h-9 w-9', className)}>
|
size="icon"
|
||||||
<PaletteIcon className="h-5 w-5" />
|
className={cn('h-9 w-9', className)}
|
||||||
</Button>
|
onClick={handleThemeToggle}
|
||||||
</PopoverTrigger>
|
tooltip={effectiveTheme === 'dark' ? t('settings.light') : t('settings.dark')}
|
||||||
<PopoverContent side="bottom" align="end" className="w-56">
|
>
|
||||||
<div className="flex flex-col gap-4">
|
{effectiveTheme === 'dark' ? (
|
||||||
<div className="flex flex-col gap-2">
|
<MoonIcon className="h-5 w-5" />
|
||||||
<label className="text-sm font-medium">{t('settings.language')}</label>
|
) : (
|
||||||
<Select value={language} onValueChange={handleLanguageChange}>
|
<SunIcon className="h-5 w-5" />
|
||||||
<SelectTrigger>
|
)}
|
||||||
<SelectValue />
|
</Button>
|
||||||
</SelectTrigger>
|
|
||||||
<SelectContent>
|
|
||||||
<SelectItem value="en">English</SelectItem>
|
|
||||||
<SelectItem value="zh">中文</SelectItem>
|
|
||||||
<SelectItem value="fr">Français</SelectItem>
|
|
||||||
<SelectItem value="ar">العربية</SelectItem>
|
|
||||||
<SelectItem value="zh_TW">繁體中文</SelectItem>
|
|
||||||
</SelectContent>
|
|
||||||
</Select>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div className="flex flex-col gap-2">
|
|
||||||
<label className="text-sm font-medium">{t('settings.theme')}</label>
|
|
||||||
<Select value={theme} onValueChange={handleThemeChange}>
|
|
||||||
<SelectTrigger>
|
|
||||||
<SelectValue />
|
|
||||||
</SelectTrigger>
|
|
||||||
<SelectContent>
|
|
||||||
<SelectItem value="light">{t('settings.light')}</SelectItem>
|
|
||||||
<SelectItem value="dark">{t('settings.dark')}</SelectItem>
|
|
||||||
<SelectItem value="system">{t('settings.system')}</SelectItem>
|
|
||||||
</SelectContent>
|
|
||||||
</Select>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</PopoverContent>
|
|
||||||
</Popover>
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ import { useEffect, useState } from 'react'
|
||||||
import StatusDialog from './StatusDialog'
|
import StatusDialog from './StatusDialog'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
|
|
||||||
const StatusIndicator = () => {
|
const StatusIndicator = ({ className }: { className?: string }) => {
|
||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
const health = useBackendState.use.health()
|
const health = useBackendState.use.health()
|
||||||
const lastCheckTime = useBackendState.use.lastCheckTime()
|
const lastCheckTime = useBackendState.use.lastCheckTime()
|
||||||
|
|
@ -20,7 +20,7 @@ const StatusIndicator = () => {
|
||||||
}, [lastCheckTime])
|
}, [lastCheckTime])
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="fixed right-4 bottom-4 flex items-center gap-2 opacity-80 select-none">
|
<div className={cn("flex items-center gap-2 opacity-80 select-none", className)}>
|
||||||
<div
|
<div
|
||||||
className="flex cursor-pointer items-center gap-2"
|
className="flex cursor-pointer items-center gap-2"
|
||||||
onClick={() => setDialogOpen(true)}
|
onClick={() => setDialogOpen(true)}
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,14 @@
|
||||||
import Button from '@/components/ui/Button'
|
import Button from '@/components/ui/Button'
|
||||||
import { SiteInfo, webuiPrefix } from '@/lib/constants'
|
import { webuiPrefix } from '@/lib/constants'
|
||||||
import AppSettings from '@/components/AppSettings'
|
import AppSettings from '@/components/AppSettings'
|
||||||
|
import StatusIndicator from '@/components/status/StatusIndicator'
|
||||||
import { TabsList, TabsTrigger } from '@/components/ui/Tabs'
|
import { TabsList, TabsTrigger } from '@/components/ui/Tabs'
|
||||||
import { useSettingsStore } from '@/stores/settings'
|
import { useSettingsStore } from '@/stores/settings'
|
||||||
import { useAuthStore } from '@/stores/state'
|
import { useAuthStore } from '@/stores/state'
|
||||||
import { cn } from '@/lib/utils'
|
import { cn } from '@/lib/utils'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
import { navigationService } from '@/services/navigation'
|
import { navigationService } from '@/services/navigation'
|
||||||
import { ZapIcon, GithubIcon, LogOutIcon } from 'lucide-react'
|
import { ZapIcon, LogOutIcon } from 'lucide-react'
|
||||||
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/Tooltip'
|
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/Tooltip'
|
||||||
|
|
||||||
interface NavigationTabProps {
|
interface NavigationTabProps {
|
||||||
|
|
@ -56,17 +57,8 @@ function TabsNavigation() {
|
||||||
|
|
||||||
export default function SiteHeader() {
|
export default function SiteHeader() {
|
||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
const { isGuestMode, coreVersion, apiVersion, username, webuiTitle, webuiDescription } = useAuthStore()
|
const { isGuestMode, username, webuiTitle, webuiDescription } = useAuthStore()
|
||||||
|
const enableHealthCheck = useSettingsStore.use.enableHealthCheck()
|
||||||
const versionDisplay = (coreVersion && apiVersion)
|
|
||||||
? `${coreVersion}/${apiVersion}`
|
|
||||||
: null;
|
|
||||||
|
|
||||||
// Check if frontend needs rebuild (apiVersion ends with warning symbol)
|
|
||||||
const hasWarning = apiVersion?.endsWith('⚠️');
|
|
||||||
const versionTooltip = hasWarning
|
|
||||||
? t('header.frontendNeedsRebuild')
|
|
||||||
: versionDisplay ? `v${versionDisplay}` : '';
|
|
||||||
|
|
||||||
const handleLogout = () => {
|
const handleLogout = () => {
|
||||||
navigationService.navigateToLogin();
|
navigationService.navigateToLogin();
|
||||||
|
|
@ -77,7 +69,6 @@ export default function SiteHeader() {
|
||||||
<div className="min-w-[200px] w-auto flex items-center">
|
<div className="min-w-[200px] w-auto flex items-center">
|
||||||
<a href={webuiPrefix} className="flex items-center gap-2">
|
<a href={webuiPrefix} className="flex items-center gap-2">
|
||||||
<ZapIcon className="size-4 text-emerald-400" aria-hidden="true" />
|
<ZapIcon className="size-4 text-emerald-400" aria-hidden="true" />
|
||||||
<span className="font-bold md:inline-block">{SiteInfo.name}</span>
|
|
||||||
</a>
|
</a>
|
||||||
{webuiTitle && (
|
{webuiTitle && (
|
||||||
<div className="flex items-center">
|
<div className="flex items-center">
|
||||||
|
|
@ -111,25 +102,7 @@ export default function SiteHeader() {
|
||||||
|
|
||||||
<nav className="w-[200px] flex items-center justify-end">
|
<nav className="w-[200px] flex items-center justify-end">
|
||||||
<div className="flex items-center gap-2">
|
<div className="flex items-center gap-2">
|
||||||
{versionDisplay && (
|
{enableHealthCheck && <StatusIndicator />}
|
||||||
<TooltipProvider>
|
|
||||||
<Tooltip>
|
|
||||||
<TooltipTrigger asChild>
|
|
||||||
<span className="text-xs text-gray-500 dark:text-gray-400 mr-1 cursor-default">
|
|
||||||
v{versionDisplay}
|
|
||||||
</span>
|
|
||||||
</TooltipTrigger>
|
|
||||||
<TooltipContent side="bottom">
|
|
||||||
{versionTooltip}
|
|
||||||
</TooltipContent>
|
|
||||||
</Tooltip>
|
|
||||||
</TooltipProvider>
|
|
||||||
)}
|
|
||||||
<Button variant="ghost" size="icon" side="bottom" tooltip={t('header.projectRepository')}>
|
|
||||||
<a href={SiteInfo.github} target="_blank" rel="noopener noreferrer">
|
|
||||||
<GithubIcon className="size-4" aria-hidden="true" />
|
|
||||||
</a>
|
|
||||||
</Button>
|
|
||||||
<AppSettings />
|
<AppSettings />
|
||||||
{!isGuestMode && (
|
{!isGuestMode && (
|
||||||
<Button
|
<Button
|
||||||
|
|
|
||||||
|
|
@ -87,7 +87,7 @@ interface SettingsState {
|
||||||
const useSettingsStoreBase = create<SettingsState>()(
|
const useSettingsStoreBase = create<SettingsState>()(
|
||||||
persist(
|
persist(
|
||||||
(set) => ({
|
(set) => ({
|
||||||
theme: 'system',
|
theme: 'light',
|
||||||
language: 'en',
|
language: 'en',
|
||||||
showPropertyPanel: true,
|
showPropertyPanel: true,
|
||||||
showNodeSearchBar: true,
|
showNodeSearchBar: true,
|
||||||
|
|
@ -238,7 +238,7 @@ const useSettingsStoreBase = create<SettingsState>()(
|
||||||
{
|
{
|
||||||
name: 'settings-storage',
|
name: 'settings-storage',
|
||||||
storage: createJSONStorage(() => localStorage),
|
storage: createJSONStorage(() => localStorage),
|
||||||
version: 19,
|
version: 20,
|
||||||
migrate: (state: any, version: number) => {
|
migrate: (state: any, version: number) => {
|
||||||
if (version < 2) {
|
if (version < 2) {
|
||||||
state.showEdgeLabel = false
|
state.showEdgeLabel = false
|
||||||
|
|
@ -341,6 +341,15 @@ const useSettingsStoreBase = create<SettingsState>()(
|
||||||
delete state.querySettings.response_type
|
delete state.querySettings.response_type
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (version < 20) {
|
||||||
|
// Only set defaults if values are missing (preserve user preference)
|
||||||
|
if (!state.theme) {
|
||||||
|
state.theme = 'light'
|
||||||
|
}
|
||||||
|
if (!state.language) {
|
||||||
|
state.language = 'en'
|
||||||
|
}
|
||||||
|
}
|
||||||
return state
|
return state
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
1
tests/test_entity_resolution/__init__.py
Normal file
1
tests/test_entity_resolution/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Entity resolution tests
|
||||||
240
tests/test_entity_resolution/test_resolver.py
Normal file
240
tests/test_entity_resolution/test_resolver.py
Normal file
|
|
@ -0,0 +1,240 @@
|
||||||
|
"""
|
||||||
|
Unit tests for Entity Resolution
|
||||||
|
|
||||||
|
Tests the 3-layer approach with mock embed_fn and llm_fn.
|
||||||
|
No database or external services required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from lightrag.entity_resolution import (
|
||||||
|
EntityResolutionConfig,
|
||||||
|
resolve_entity,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mock embeddings - pre-computed for test entities
|
||||||
|
# These simulate what an embedding model would return
|
||||||
|
MOCK_EMBEDDINGS = {
|
||||||
|
# FDA and full name have ~0.67 similarity (based on real test)
|
||||||
|
"fda": [0.1, 0.2, 0.3, 0.4, 0.5],
|
||||||
|
"us food and drug administration": [0.15, 0.25, 0.28, 0.38, 0.52],
|
||||||
|
# Dupixent and dupilumab have ~0.63 similarity
|
||||||
|
"dupixent": [0.5, 0.6, 0.7, 0.8, 0.9],
|
||||||
|
"dupilumab": [0.48, 0.58, 0.72, 0.78, 0.88],
|
||||||
|
# Celebrex and Cerebyx are different (low similarity)
|
||||||
|
"celebrex": [0.9, 0.1, 0.2, 0.3, 0.4],
|
||||||
|
"cerebyx": [0.1, 0.9, 0.8, 0.7, 0.6],
|
||||||
|
# Default for unknown entities
|
||||||
|
"default": [0.0, 0.0, 0.0, 0.0, 0.0],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Mock LLM responses
|
||||||
|
MOCK_LLM_RESPONSES = {
|
||||||
|
("fda", "us food and drug administration"): "YES",
|
||||||
|
("us food and drug administration", "fda"): "YES",
|
||||||
|
("dupixent", "dupilumab"): "YES",
|
||||||
|
("dupilumab", "dupixent"): "YES",
|
||||||
|
("heart attack", "myocardial infarction"): "YES",
|
||||||
|
("celebrex", "cerebyx"): "NO",
|
||||||
|
("metformin", "metoprolol"): "NO",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def mock_embed_fn(text: str) -> list[float]:
|
||||||
|
"""Mock embedding function."""
|
||||||
|
key = text.lower().strip()
|
||||||
|
return MOCK_EMBEDDINGS.get(key, MOCK_EMBEDDINGS["default"])
|
||||||
|
|
||||||
|
|
||||||
|
async def mock_llm_fn(prompt: str) -> str:
|
||||||
|
"""Mock LLM function that parses the prompt and returns YES/NO."""
|
||||||
|
# Extract term_a and term_b from the prompt
|
||||||
|
lines = prompt.strip().split("\n")
|
||||||
|
term_a = None
|
||||||
|
term_b = None
|
||||||
|
for line in lines:
|
||||||
|
if line.startswith("Term A:"):
|
||||||
|
term_a = line.replace("Term A:", "").strip().lower()
|
||||||
|
elif line.startswith("Term B:"):
|
||||||
|
term_b = line.replace("Term B:", "").strip().lower()
|
||||||
|
|
||||||
|
if term_a and term_b:
|
||||||
|
# Check both orderings
|
||||||
|
response = MOCK_LLM_RESPONSES.get((term_a, term_b))
|
||||||
|
if response is None:
|
||||||
|
response = MOCK_LLM_RESPONSES.get((term_b, term_a), "NO")
|
||||||
|
return response
|
||||||
|
return "NO"
|
||||||
|
|
||||||
|
|
||||||
|
# Test fixtures
|
||||||
|
@pytest.fixture
|
||||||
|
def existing_entities():
|
||||||
|
"""Existing entities in the knowledge graph."""
|
||||||
|
return [
|
||||||
|
(
|
||||||
|
"US Food and Drug Administration",
|
||||||
|
MOCK_EMBEDDINGS["us food and drug administration"],
|
||||||
|
),
|
||||||
|
("Dupixent", MOCK_EMBEDDINGS["dupixent"]),
|
||||||
|
("Celebrex", MOCK_EMBEDDINGS["celebrex"]),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def config():
|
||||||
|
"""Default resolution config."""
|
||||||
|
return EntityResolutionConfig()
|
||||||
|
|
||||||
|
|
||||||
|
# Layer 1: Case normalization tests
|
||||||
|
class TestCaseNormalization:
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_exact_match_same_case(self, existing_entities, config):
|
||||||
|
"""Exact match with same case."""
|
||||||
|
result = await resolve_entity(
|
||||||
|
"Dupixent",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "match"
|
||||||
|
assert result.matched_entity == "Dupixent"
|
||||||
|
assert result.method == "exact"
|
||||||
|
assert result.confidence == 1.0
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_exact_match_different_case(self, existing_entities, config):
|
||||||
|
"""DUPIXENT should match Dupixent via case normalization."""
|
||||||
|
result = await resolve_entity(
|
||||||
|
"DUPIXENT",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "match"
|
||||||
|
assert result.matched_entity == "Dupixent"
|
||||||
|
assert result.method == "exact"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_exact_match_lowercase(self, existing_entities, config):
|
||||||
|
"""dupixent should match Dupixent."""
|
||||||
|
result = await resolve_entity(
|
||||||
|
"dupixent",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "match"
|
||||||
|
assert result.method == "exact"
|
||||||
|
|
||||||
|
|
||||||
|
# Layer 2: Fuzzy matching tests
|
||||||
|
class TestFuzzyMatching:
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_fuzzy_match_typo(self, existing_entities, config):
|
||||||
|
"""Dupixant (typo) should match Dupixent via fuzzy matching (88%)."""
|
||||||
|
result = await resolve_entity(
|
||||||
|
"Dupixant",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "match"
|
||||||
|
assert result.matched_entity == "Dupixent"
|
||||||
|
assert result.method == "fuzzy"
|
||||||
|
assert result.confidence >= 0.85
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_fuzzy_rejects_below_threshold(self, existing_entities, config):
|
||||||
|
"""Celebrex vs Cerebyx is 67% - should NOT fuzzy match."""
|
||||||
|
# Add Cerebyx as the query (Celebrex exists)
|
||||||
|
result = await resolve_entity(
|
||||||
|
"Cerebyx",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
# Should not be fuzzy match (67% < 85%)
|
||||||
|
assert result.method != "fuzzy" or result.action == "new"
|
||||||
|
|
||||||
|
|
||||||
|
# Layer 3: LLM verification tests
|
||||||
|
class TestLLMVerification:
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_llm_matches_acronym(self, existing_entities, config):
|
||||||
|
"""FDA should match US Food and Drug Administration via LLM."""
|
||||||
|
result = await resolve_entity(
|
||||||
|
"FDA",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "match"
|
||||||
|
assert result.matched_entity == "US Food and Drug Administration"
|
||||||
|
assert result.method == "llm"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_llm_matches_brand_generic(self, config):
|
||||||
|
"""Dupixent should match dupilumab via LLM."""
|
||||||
|
existing = [
|
||||||
|
("dupilumab", MOCK_EMBEDDINGS["dupilumab"]),
|
||||||
|
]
|
||||||
|
result = await resolve_entity(
|
||||||
|
"Dupixent",
|
||||||
|
existing,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "match"
|
||||||
|
assert result.matched_entity == "dupilumab"
|
||||||
|
assert result.method == "llm"
|
||||||
|
|
||||||
|
|
||||||
|
# Edge cases
|
||||||
|
class TestEdgeCases:
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_empty_existing_entities(self, config):
|
||||||
|
"""New entity when no existing entities."""
|
||||||
|
result = await resolve_entity(
|
||||||
|
"NewEntity",
|
||||||
|
[],
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "new"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_disabled_resolution(self, existing_entities):
|
||||||
|
"""Resolution disabled returns new."""
|
||||||
|
config = EntityResolutionConfig(enabled=False)
|
||||||
|
result = await resolve_entity(
|
||||||
|
"Dupixent",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "new"
|
||||||
|
assert result.method == "disabled"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_genuinely_new_entity(self, existing_entities, config):
|
||||||
|
"""Completely new entity should return 'new'."""
|
||||||
|
result = await resolve_entity(
|
||||||
|
"CompletelyNewDrug",
|
||||||
|
existing_entities,
|
||||||
|
mock_embed_fn,
|
||||||
|
mock_llm_fn,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
assert result.action == "new"
|
||||||
|
assert result.method == "none"
|
||||||
Loading…
Add table
Reference in a new issue