LightRAG/lightrag/entity_resolution/resolver.py
clssck 69358d830d test(lightrag,examples,api): comprehensive ruff formatting and type hints
Format entire codebase with ruff and add type hints across all modules:
- Apply ruff formatting to all Python files (121 files, 17K insertions)
- Add type hints to function signatures throughout lightrag core and API
- Update test suite with improved type annotations and docstrings
- Add pyrightconfig.json for static type checking configuration
- Create prompt_optimized.py and test_extraction_prompt_ab.py test files
- Update ruff.toml and .gitignore for improved linting configuration
- Standardize code style across examples, reproduce scripts, and utilities
2025-12-05 15:17:06 +01:00

334 lines
11 KiB
Python

"""Entity Resolution - 3-Layer Approach
Layer 1: Case normalization (exact match)
Layer 2: Fuzzy string matching (>85% = typos)
Layer 3: Vector similarity + LLM verification (semantic matches)
Uses the same LLM that LightRAG is configured with.
"""
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from difflib import SequenceMatcher
import numpy as np
from lightrag.utils import logger
from .config import DEFAULT_CONFIG, EntityResolutionConfig
@dataclass
class ResolutionResult:
"""Result of entity resolution attempt."""
action: str # "match" | "new"
matched_entity: str | None
confidence: float
method: str # "exact" | "fuzzy" | "llm" | "none" | "disabled"
def cosine_similarity(a: list[float], b: list[float]) -> float:
"""Calculate cosine similarity between two vectors."""
a_arr, b_arr = np.array(a), np.array(b)
norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))
def fuzzy_similarity(a: str, b: str) -> float:
"""Calculate fuzzy string similarity (0-1)."""
return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()
def find_vector_candidates(
query_embedding: list[float],
existing_entities: list[tuple[str, list[float]]],
threshold: float,
) -> list[tuple[str, float]]:
"""Find entities with vector similarity above threshold."""
candidates = []
for name, embedding in existing_entities:
sim = cosine_similarity(query_embedding, embedding)
if sim >= threshold:
candidates.append((name, sim))
# Sort by similarity descending
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates
async def llm_verify(
term_a: str,
term_b: str,
llm_fn: Callable[[str], Awaitable[str]],
prompt_template: str,
) -> bool:
"""Ask LLM if two terms refer to the same entity.
Uses strict parsing with exact token matching only. Accepted responses:
- Positive: "YES", "TRUE", "SAME", "MATCH"
- Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME"
Any other response defaults to False to avoid false positive merges.
"""
prompt = prompt_template.format(term_a=term_a, term_b=term_b)
response = await llm_fn(prompt)
# Normalize response: strip whitespace, take first line only
normalized = response.strip().split('\n')[0].strip().upper()
# Remove common trailing punctuation
normalized = normalized.rstrip('.!,')
# Only accept exact tokens (no prefix/substring matching)
if normalized in ('YES', 'TRUE', 'SAME', 'MATCH'):
return True
if normalized in ('NO', 'FALSE', 'DIFFERENT', 'NOT SAME'):
return False
# Default to False for ambiguous responses (safer than false positive)
return False
async def resolve_entity(
entity_name: str,
existing_entities: list[tuple[str, list[float]]],
embed_fn: Callable[[str], Awaitable[list[float]]],
llm_fn: Callable[[str], Awaitable[str]],
config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
"""Resolve an entity against existing entities using 3-layer approach.
Args:
entity_name: The new entity name to resolve
existing_entities: List of (name, embedding) tuples for existing entities
embed_fn: Async function to get embedding for a string (same as LightRAG uses)
llm_fn: Async function to query LLM (same as LightRAG uses)
config: Resolution configuration
Returns:
ResolutionResult with action ("match" or "new"), matched entity,
confidence, and method used.
"""
if not config.enabled:
return ResolutionResult('new', None, 0.0, 'disabled')
if not existing_entities:
return ResolutionResult('new', None, 0.0, 'none')
normalized = entity_name.lower().strip()
# Layer 1: Case-insensitive exact match
for name, _ in existing_entities:
if name.lower().strip() == normalized:
return ResolutionResult('match', name, 1.0, 'exact')
# Layer 2: Fuzzy string matching (catches typos)
best_fuzzy_match = None
best_fuzzy_score = 0.0
for name, _ in existing_entities:
similarity = fuzzy_similarity(entity_name, name)
if similarity > best_fuzzy_score:
best_fuzzy_score = similarity
best_fuzzy_match = name
if best_fuzzy_score >= config.fuzzy_threshold:
return ResolutionResult('match', best_fuzzy_match, best_fuzzy_score, 'fuzzy')
# Layer 3: Vector similarity + LLM verification
embedding = await embed_fn(entity_name)
candidates = find_vector_candidates(
embedding,
existing_entities,
config.vector_threshold,
)
# Verify top candidates with LLM
for candidate_name, similarity in candidates[: config.max_candidates]:
is_same = await llm_verify(
entity_name,
candidate_name,
llm_fn,
config.llm_prompt_template,
)
if is_same:
return ResolutionResult('match', candidate_name, similarity, 'llm')
# No match found - this is a new entity
return ResolutionResult('new', None, 0.0, 'none')
async def resolve_entity_with_vdb(
entity_name: str,
entity_vdb, # BaseVectorStorage - imported dynamically to avoid circular imports
llm_fn: Callable[[str], Awaitable[str]],
config: EntityResolutionConfig = DEFAULT_CONFIG,
) -> ResolutionResult:
"""Resolve an entity using VDB for similarity search.
This is the production integration that uses LightRAG's vector database
directly instead of requiring pre-computed embeddings.
Args:
entity_name: The new entity name to resolve
entity_vdb: LightRAG's entity vector database (BaseVectorStorage)
llm_fn: Async function to query LLM (same as LightRAG uses)
config: Resolution configuration
Returns:
ResolutionResult with action ("match" or "new"), matched entity,
confidence, and method used.
"""
if not config.enabled:
return ResolutionResult('new', None, 0.0, 'disabled')
if entity_vdb is None:
return ResolutionResult('new', None, 0.0, 'none')
normalized = entity_name.lower().strip()
# Query VDB for similar entities - cast wide net, LLM will verify
# top_k is doubled to have enough candidates after filtering
try:
candidates = await entity_vdb.query(entity_name, top_k=config.max_candidates * 3)
except Exception as e:
# Log and skip resolution if VDB query fails
logger.debug(f"VDB query failed for '{entity_name}': {e}")
return ResolutionResult('new', None, 0.0, 'none')
if not candidates:
return ResolutionResult('new', None, 0.0, 'none')
# Layer 1: Case-insensitive exact match among candidates
for candidate in candidates:
candidate_name = candidate.get('entity_name')
if candidate_name and candidate_name.lower().strip() == normalized:
return ResolutionResult('match', candidate_name, 1.0, 'exact')
# Layer 2: Fuzzy string matching (catches typos)
best_fuzzy_match = None
best_fuzzy_score = 0.0
for candidate in candidates:
candidate_name = candidate.get('entity_name')
if not candidate_name:
continue
similarity = fuzzy_similarity(entity_name, candidate_name)
if similarity > best_fuzzy_score:
best_fuzzy_score = similarity
best_fuzzy_match = candidate_name
if best_fuzzy_score >= config.fuzzy_threshold:
return ResolutionResult('match', best_fuzzy_match, best_fuzzy_score, 'fuzzy')
# Layer 3: LLM verification on top candidates
verified_count = 0
for candidate in candidates:
if verified_count >= config.max_candidates:
break
candidate_name = candidate.get('entity_name')
if not candidate_name:
continue
is_same = await llm_verify(
entity_name,
candidate_name,
llm_fn,
config.llm_prompt_template,
)
verified_count += 1
if is_same:
# Use distance from VDB if available (converted to similarity)
similarity = 0.7 # Default confidence for LLM match
return ResolutionResult('match', candidate_name, similarity, 'llm')
# No match found - this is a new entity
return ResolutionResult('new', None, 0.0, 'none')
# --- Alias Cache Functions (PostgreSQL) ---
async def get_cached_alias(
alias: str,
db, # PostgresDB instance
workspace: str,
) -> tuple[str, str, float] | None:
"""Check if alias is already resolved in cache.
Args:
alias: The entity name to look up
db: PostgresDB instance with query method
workspace: Workspace for isolation
Returns:
Tuple of (canonical_entity, method, confidence) if found, None otherwise
"""
import logging
from lightrag.kg.postgres_impl import SQL_TEMPLATES
logger = logging.getLogger(__name__)
normalized_alias = alias.lower().strip()
sql = SQL_TEMPLATES['get_alias']
try:
result = await db.query(sql, params=[workspace, normalized_alias])
if result:
return (
result['canonical_entity'],
result['method'],
result['confidence'],
)
except Exception as e:
logger.debug(f'Alias cache lookup error: {e}')
return None
async def store_alias(
alias: str,
canonical: str,
method: str,
confidence: float,
db, # PostgresDB instance
workspace: str,
) -> None:
"""Store a resolution in the alias cache.
Args:
alias: The variant name (e.g., "FDA")
canonical: The resolved canonical name (e.g., "US Food and Drug Administration")
method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual')
confidence: Resolution confidence (0-1)
db: PostgresDB instance with execute method
workspace: Workspace for isolation
"""
import logging
from datetime import datetime, timezone
from lightrag.kg.postgres_impl import SQL_TEMPLATES
logger = logging.getLogger(__name__)
normalized_alias = alias.lower().strip()
# Don't store self-referential aliases (e.g., "FDA" → "FDA")
if normalized_alias == canonical.lower().strip():
return
sql = SQL_TEMPLATES['upsert_alias']
try:
await db.execute(
sql,
data={
'workspace': workspace,
'alias': normalized_alias,
'canonical_entity': canonical,
'method': method,
'confidence': confidence,
'create_time': datetime.now(timezone.utc).replace(tzinfo=None),
},
)
except Exception as e:
logger.debug(f'Alias cache store error: {e}')