"""Entity Resolution - 3-Layer Approach

Layer 1: Case normalization (exact match)
Layer 2: Fuzzy string matching (>85% = typos)
Layer 3: Vector similarity + LLM verification (semantic matches)

Uses the same LLM that LightRAG is configured with.
"""

from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from difflib import SequenceMatcher

import numpy as np

from lightrag.utils import logger

from .config import DEFAULT_CONFIG, EntityResolutionConfig


@dataclass
class ResolutionResult:
    """Result of entity resolution attempt."""

    action: str  # "match" | "new"
    matched_entity: str | None
    confidence: float
    method: str  # "exact" | "fuzzy" | "llm" | "none" | "disabled"


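# Illustrative only: the two result shapes seen in practice. The field values
# below are invented examples, not output from real data.
#   ResolutionResult('match', 'US Food and Drug Administration', 0.93, 'fuzzy')
#   ResolutionResult('new', None, 0.0, 'none')

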
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    a_arr, b_arr = np.array(a), np.array(b)
    norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))


def fuzzy_similarity(a: str, b: str) -> float:
    """Calculate fuzzy string similarity (0-1)."""
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()


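# A minimal sketch (not part of the library API) of how the Layer 2 threshold
# from the module docstring plays out: names differing only by punctuation or
# a small typo score well above 0.85, while unrelated names score far lower.
# The entity names here are invented for illustration.
def _demo_fuzzy_layer() -> None:
    # "Acme Corp" vs "Acme Corp." differ by one trailing character, so the
    # SequenceMatcher ratio lands around 0.95.
    assert fuzzy_similarity('Acme Corp', 'Acme Corp.') > 0.9
    # An unrelated name shares only scattered characters and scores low.
    assert fuzzy_similarity('Acme Corp', 'Globex Inc') < 0.5

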
def find_vector_candidates(
    query_embedding: list[float],
    existing_entities: list[tuple[str, list[float]]],
    threshold: float,
) -> list[tuple[str, float]]:
    """Find entities with vector similarity above threshold."""
    candidates = []
    for name, embedding in existing_entities:
        sim = cosine_similarity(query_embedding, embedding)
        if sim >= threshold:
            candidates.append((name, sim))
    # Sort by similarity descending
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates


async def llm_verify(
    term_a: str,
    term_b: str,
    llm_fn: Callable[[str], Awaitable[str]],
    prompt_template: str,
) -> bool:
    """Ask LLM if two terms refer to the same entity.

    Uses strict parsing with exact token matching only. Accepted responses:
    - Positive: "YES", "TRUE", "SAME", "MATCH"
    - Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME"
    Any other response defaults to False to avoid false positive merges.
    """
    prompt = prompt_template.format(term_a=term_a, term_b=term_b)
    response = await llm_fn(prompt)

    # Normalize response: strip whitespace, take first line only
    normalized = response.strip().split('\n')[0].strip().upper()

    # Remove common trailing punctuation
    normalized = normalized.rstrip('.!,')

    # Only accept exact tokens (no prefix/substring matching)
    if normalized in ('YES', 'TRUE', 'SAME', 'MATCH'):
        return True
    if normalized in ('NO', 'FALSE', 'DIFFERENT', 'NOT SAME'):
        return False

    # Default to False for ambiguous responses (safer than false positive)
    return False


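# A sketch of a prompt template compatible with llm_verify's {term_a}/{term_b}
# placeholders, plus stub LLM callables showing the strict parsing above. The
# template text and stub responses are assumptions for illustration only; the
# real prompt lives in EntityResolutionConfig.llm_prompt_template.
_EXAMPLE_VERIFY_PROMPT = (
    'Do "{term_a}" and "{term_b}" refer to the same real-world entity? '
    'Answer with exactly one word: YES or NO.'
)


async def _demo_llm_verify() -> None:
    async def terse_llm(prompt: str) -> str:
        return 'Yes.'  # trailing punctuation is stripped, so this parses as YES

    async def hedging_llm(prompt: str) -> str:
        return 'Probably the same entity'  # not an exact token, treated as False

    assert await llm_verify(
        'FDA', 'Food and Drug Administration', terse_llm, _EXAMPLE_VERIFY_PROMPT
    )
    assert not await llm_verify(
        'FDA', 'Food and Drug Administration', hedging_llm, _EXAMPLE_VERIFY_PROMPT
    )

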
async def resolve_entity(
    entity_name: str,
    existing_entities: list[tuple[str, list[float]]],
    embed_fn: Callable[[str], Awaitable[list[float]]],
    llm_fn: Callable[[str], Awaitable[str]],
    config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
    """Resolve an entity against existing entities using 3-layer approach.

    Args:
        entity_name: The new entity name to resolve
        existing_entities: List of (name, embedding) tuples for existing entities
        embed_fn: Async function to get embedding for a string (same as LightRAG uses)
        llm_fn: Async function to query LLM (same as LightRAG uses)
        config: Resolution configuration

    Returns:
        ResolutionResult with action ("match" or "new"), matched entity,
        confidence, and method used.
    """
    if not config.enabled:
        return ResolutionResult('new', None, 0.0, 'disabled')

    if not existing_entities:
        return ResolutionResult('new', None, 0.0, 'none')

    normalized = entity_name.lower().strip()

    # Layer 1: Case-insensitive exact match
    for name, _ in existing_entities:
        if name.lower().strip() == normalized:
            return ResolutionResult('match', name, 1.0, 'exact')

    # Layer 2: Fuzzy string matching (catches typos)
    best_fuzzy_match = None
    best_fuzzy_score = 0.0

    for name, _ in existing_entities:
        similarity = fuzzy_similarity(entity_name, name)
        if similarity > best_fuzzy_score:
            best_fuzzy_score = similarity
            best_fuzzy_match = name

    if best_fuzzy_score >= config.fuzzy_threshold:
        return ResolutionResult('match', best_fuzzy_match, best_fuzzy_score, 'fuzzy')

    # Layer 3: Vector similarity + LLM verification
    embedding = await embed_fn(entity_name)
    candidates = find_vector_candidates(
        embedding,
        existing_entities,
        config.vector_threshold,
    )

    # Verify top candidates with LLM
    for candidate_name, similarity in candidates[: config.max_candidates]:
        is_same = await llm_verify(
            entity_name,
            candidate_name,
            llm_fn,
            config.llm_prompt_template,
        )
        if is_same:
            return ResolutionResult('match', candidate_name, similarity, 'llm')

    # No match found - this is a new entity
    return ResolutionResult('new', None, 0.0, 'none')


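# Illustrative usage sketch only (run with, e.g., asyncio.run(_demo_resolve_entity())).
# The entity list, embedding values, and stub callables below are invented; in
# LightRAG the embed_fn and llm_fn would be the instance's own embedding and LLM
# functions, and the outcome depends on the thresholds (and enabled flag) carried
# by the config that is passed in.
async def _demo_resolve_entity() -> ResolutionResult:
    existing = [
        ('US Food and Drug Administration', [0.9, 0.1, 0.0]),
        ('World Health Organization', [0.1, 0.9, 0.0]),
    ]

    async def stub_embed(text: str) -> list[float]:
        return [0.88, 0.12, 0.0]  # pretend "FDA" embeds close to the first entity

    async def stub_llm(prompt: str) -> str:
        return 'YES'  # pretend the LLM confirms the semantic match

    # "FDA" fails exact and fuzzy matching against both names, so Layer 3
    # (vector candidates plus LLM verification) decides the outcome.
    return await resolve_entity('FDA', existing, stub_embed, stub_llm, DEFAULT_CONFIG)

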
async def resolve_entity_with_vdb(
    entity_name: str,
    entity_vdb,  # BaseVectorStorage - imported dynamically to avoid circular imports
    llm_fn: Callable[[str], Awaitable[str]],
    config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
    """Resolve an entity using VDB for similarity search.

    This is the production integration that uses LightRAG's vector database
    directly instead of requiring pre-computed embeddings.

    Args:
        entity_name: The new entity name to resolve
        entity_vdb: LightRAG's entity vector database (BaseVectorStorage)
        llm_fn: Async function to query LLM (same as LightRAG uses)
        config: Resolution configuration

    Returns:
        ResolutionResult with action ("match" or "new"), matched entity,
        confidence, and method used.
    """
    if not config.enabled:
        return ResolutionResult('new', None, 0.0, 'disabled')

    if entity_vdb is None:
        return ResolutionResult('new', None, 0.0, 'none')

    normalized = entity_name.lower().strip()

    # Query VDB for similar entities - cast wide net, LLM will verify.
    # top_k is three times max_candidates so enough candidates survive filtering
    try:
        candidates = await entity_vdb.query(entity_name, top_k=config.max_candidates * 3)
    except Exception as e:
        # Log and skip resolution if VDB query fails
        logger.debug(f"VDB query failed for '{entity_name}': {e}")
        return ResolutionResult('new', None, 0.0, 'none')

    if not candidates:
        return ResolutionResult('new', None, 0.0, 'none')

    # Layer 1: Case-insensitive exact match among candidates
    for candidate in candidates:
        candidate_name = candidate.get('entity_name')
        if candidate_name and candidate_name.lower().strip() == normalized:
            return ResolutionResult('match', candidate_name, 1.0, 'exact')

    # Layer 2: Fuzzy string matching (catches typos)
    best_fuzzy_match = None
    best_fuzzy_score = 0.0

    for candidate in candidates:
        candidate_name = candidate.get('entity_name')
        if not candidate_name:
            continue
        similarity = fuzzy_similarity(entity_name, candidate_name)
        if similarity > best_fuzzy_score:
            best_fuzzy_score = similarity
            best_fuzzy_match = candidate_name

    if best_fuzzy_score >= config.fuzzy_threshold:
        return ResolutionResult('match', best_fuzzy_match, best_fuzzy_score, 'fuzzy')

    # Layer 3: LLM verification on top candidates
    verified_count = 0
    for candidate in candidates:
        if verified_count >= config.max_candidates:
            break
        candidate_name = candidate.get('entity_name')
        if not candidate_name:
            continue

        is_same = await llm_verify(
            entity_name,
            candidate_name,
            llm_fn,
            config.llm_prompt_template,
        )
        verified_count += 1

        if is_same:
            # The VDB distance is not surfaced here, so report a fixed default
            # confidence for LLM-verified matches
            similarity = 0.7
            return ResolutionResult('match', candidate_name, similarity, 'llm')

    # No match found - this is a new entity
    return ResolutionResult('new', None, 0.0, 'none')


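# A sketch of the duck-typed interface resolve_entity_with_vdb relies on: any
# object exposing an async query(text, top_k=...) method that returns dicts
# carrying an 'entity_name' key will work. The fake VDB and stub LLM below are
# invented for illustration; in production entity_vdb comes from LightRAG's
# storage layer.
class _FakeEntityVDB:
    def __init__(self, names: list[str]) -> None:
        self._names = names

    async def query(self, text: str, top_k: int = 10) -> list[dict]:
        # A real VDB ranks by embedding similarity; this fake just returns
        # the stored names, truncated to top_k.
        return [{'entity_name': name} for name in self._names[:top_k]]


async def _demo_resolve_with_vdb() -> ResolutionResult:
    vdb = _FakeEntityVDB(
        ['US Food and Drug Administration', 'World Health Organization']
    )

    async def stub_llm(prompt: str) -> str:
        return 'NO'  # pretend the LLM rejects every semantic match

    # A case-only variant hits the Layer 1 exact match (assuming resolution is
    # enabled in the config), so the LLM is never consulted.
    return await resolve_entity_with_vdb(
        'US FOOD AND DRUG ADMINISTRATION', vdb, stub_llm, DEFAULT_CONFIG
    )

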
# --- Alias Cache Functions (PostgreSQL) ---


async def get_cached_alias(
    alias: str,
    db,  # PostgresDB instance
    workspace: str,
) -> tuple[str, str, float] | None:
    """Check if alias is already resolved in cache.

    Args:
        alias: The entity name to look up
        db: PostgresDB instance with query method
        workspace: Workspace for isolation

    Returns:
        Tuple of (canonical_entity, method, confidence) if found, None otherwise
    """
    import logging

    from lightrag.kg.postgres_impl import SQL_TEMPLATES

    logger = logging.getLogger(__name__)
    normalized_alias = alias.lower().strip()

    sql = SQL_TEMPLATES['get_alias']
    try:
        result = await db.query(sql, params=[workspace, normalized_alias])
        if result:
            return (
                result['canonical_entity'],
                result['method'],
                result['confidence'],
            )
    except Exception as e:
        logger.debug(f'Alias cache lookup error: {e}')
    return None


async def store_alias(
    alias: str,
    canonical: str,
    method: str,
    confidence: float,
    db,  # PostgresDB instance
    workspace: str,
) -> None:
    """Store a resolution in the alias cache.

    Args:
        alias: The variant name (e.g., "FDA")
        canonical: The resolved canonical name (e.g., "US Food and Drug Administration")
        method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual')
        confidence: Resolution confidence (0-1)
        db: PostgresDB instance with execute method
        workspace: Workspace for isolation
    """
    import logging
    from datetime import datetime, timezone

    from lightrag.kg.postgres_impl import SQL_TEMPLATES

    logger = logging.getLogger(__name__)
    normalized_alias = alias.lower().strip()

    # Don't store self-referential aliases (e.g., "FDA" → "FDA")
    if normalized_alias == canonical.lower().strip():
        return

    sql = SQL_TEMPLATES['upsert_alias']
    try:
        await db.execute(
            sql,
            data={
                'workspace': workspace,
                'alias': normalized_alias,
                'canonical_entity': canonical,
                'method': method,
                'confidence': confidence,
                'create_time': datetime.now(timezone.utc).replace(tzinfo=None),
            },
        )
    except Exception as e:
        logger.debug(f'Alias cache store error: {e}')


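# A sketch (an assumption, not a LightRAG entry point) of the cache-first flow
# these two helpers enable: consult the alias cache, fall back to full VDB
# resolution, then remember the outcome. The db and workspace arguments stand
# in for LightRAG's PostgresDB handle and workspace string.
async def _resolve_with_alias_cache(
    entity_name: str,
    entity_vdb,
    llm_fn: Callable[[str], Awaitable[str]],
    db,
    workspace: str,
    config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
    cached = await get_cached_alias(entity_name, db, workspace)
    if cached is not None:
        canonical, method, confidence = cached
        return ResolutionResult('match', canonical, confidence, method)

    result = await resolve_entity_with_vdb(entity_name, entity_vdb, llm_fn, config)
    if result.action == 'match' and result.matched_entity:
        await store_alias(
            entity_name,
            result.matched_entity,
            result.method,
            result.confidence,
            db,
            workspace,
        )
    return result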