diff --git a/docker-compose.test.yml b/docker-compose.test.yml new file mode 100644 index 00000000..91610100 --- /dev/null +++ b/docker-compose.test.yml @@ -0,0 +1,84 @@ +name: lightrag-entity-resolution-test + +services: + postgres: + container_name: lightrag-postgres + build: + context: ./docker/postgres-age-vector + dockerfile: Dockerfile + environment: + POSTGRES_DB: lightrag + POSTGRES_USER: lightrag + POSTGRES_PASSWORD: lightrag_pass + ports: + - "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres + volumes: + - pgdata_test:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"] + interval: 5s + timeout: 5s + retries: 5 + + lightrag: + container_name: lightrag-test + build: + context: . + dockerfile: Dockerfile + ports: + - "9622:9621" # Use 9622 to avoid conflict + volumes: + - ./data/rag_storage_test:/app/data/rag_storage + - ./data/inputs_test:/app/data/inputs + environment: + # Server + - HOST=0.0.0.0 + - PORT=9621 + - LOG_LEVEL=DEBUG + + # LLM (OpenAI) + - LLM_BINDING=openai + - LLM_MODEL=gpt-4o-mini + - LLM_BINDING_HOST=https://api.openai.com/v1 + - LLM_BINDING_API_KEY=${OPENAI_API_KEY} + + # Embedding + - EMBEDDING_BINDING=openai + - EMBEDDING_MODEL=text-embedding-3-small + - EMBEDDING_DIM=1536 + - EMBEDDING_BINDING_HOST=https://api.openai.com/v1 + - EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY} + + # Storage Configuration - Full PostgreSQL! + # Custom postgres image has pgvector + Apache AGE + - LIGHTRAG_KV_STORAGE=PGKVStorage + - LIGHTRAG_VECTOR_STORAGE=PGVectorStorage + - LIGHTRAG_GRAPH_STORAGE=PGGraphStorage + - LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage + - POSTGRES_HOST=postgres + - POSTGRES_PORT=5432 + - POSTGRES_USER=lightrag + - POSTGRES_PASSWORD=lightrag_pass + - POSTGRES_DATABASE=lightrag + + # Entity Resolution - ENABLED! 
+ - ENTITY_RESOLUTION_ENABLED=true + - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85 + - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5 + - ENTITY_RESOLUTION_MAX_CANDIDATES=3 + + # Processing + - MAX_ASYNC=4 + - CHUNK_SIZE=1200 + depends_on: + postgres: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 30s + +volumes: + pgdata_test: diff --git a/docker/postgres-age-vector/Dockerfile b/docker/postgres-age-vector/Dockerfile new file mode 100644 index 00000000..e9538d95 --- /dev/null +++ b/docker/postgres-age-vector/Dockerfile @@ -0,0 +1,26 @@ +# Start from pgvector image (has vector extension pre-built correctly) +FROM pgvector/pgvector:pg17 + +# Install build dependencies for AGE +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + postgresql-server-dev-17 \ + libreadline-dev \ + zlib1g-dev \ + flex \ + bison \ + && rm -rf /var/lib/apt/lists/* + +# Install Apache AGE 1.6.0 for PG17 +RUN cd /tmp \ + && git clone --branch release/PG17/1.6.0 https://github.com/apache/age.git \ + && cd age \ + && make \ + && make install \ + && rm -rf /tmp/age + +# Add initialization script to create extensions +RUN echo "CREATE EXTENSION IF NOT EXISTS vector;" > /docker-entrypoint-initdb.d/01-vector.sql \ + && echo "CREATE EXTENSION IF NOT EXISTS age;" > /docker-entrypoint-initdb.d/02-age.sql \ + && echo "SET search_path = ag_catalog, public;" >> /docker-entrypoint-initdb.d/02-age.sql diff --git a/env.example b/env.example index d30a03cb..351f864b 100644 --- a/env.example +++ b/env.example @@ -127,6 +127,16 @@ SUMMARY_LANGUAGE=English ### Entity types that the LLM will attempt to recognize # ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]' +########################################################### +### Entity Resolution Configuration +### Automatically deduplicates entities (e.g., "FDA" = "US Food and Drug Administration") +### Uses 3-layer approach: case normalization → fuzzy matching → LLM verification +########################################################### +# ENTITY_RESOLUTION_ENABLED=true +# ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85 +# ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5 +# ENTITY_RESOLUTION_MAX_CANDIDATES=3 + ### Chunk size for document splitting, 500~1500 is recommended # CHUNK_SIZE=1200 # CHUNK_OVERLAP_SIZE=100 diff --git a/examples/test_entity_resolution.py b/examples/test_entity_resolution.py new file mode 100644 index 00000000..b3fa2dd0 --- /dev/null +++ b/examples/test_entity_resolution.py @@ -0,0 +1,93 @@ +""" +Quick test for Entity Resolution feature. + +Tests that: +1. "FDA" and "US Food and Drug Administration" resolve to the same entity +2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching +""" + +import asyncio +import os +import shutil + +from lightrag import LightRAG +from lightrag.entity_resolution import EntityResolutionConfig +from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed +from lightrag.utils import logger +import logging + +WORKING_DIR = "./test_entity_resolution" + +# Test document with entities that should be deduplicated +TEST_DOC = """ +The FDA approved Dupixent for treating eczema in 2017. +The US Food and Drug Administration later expanded the drug's indications. +Dupixant (sometimes misspelled) has shown good results in clinical trials. +The FDA continues to monitor the safety of Dupixent. 
+""" + + +async def main(): + if not os.getenv("OPENAI_API_KEY"): + print("Error: Set OPENAI_API_KEY environment variable") + return + + # Clean up previous test + if os.path.exists(WORKING_DIR): + shutil.rmtree(WORKING_DIR) + os.makedirs(WORKING_DIR) + + # Set up logging to see resolution messages + logging.basicConfig(level=logging.DEBUG) + logger.setLevel(logging.DEBUG) + + print("\n" + "=" * 60) + print("Entity Resolution Test") + print("=" * 60) + + rag = LightRAG( + working_dir=WORKING_DIR, + embedding_func=openai_embed, + llm_model_func=gpt_4o_mini_complete, + entity_resolution_config=EntityResolutionConfig( + enabled=True, + fuzzy_threshold=0.85, + vector_threshold=0.5, + max_candidates=3, + ), + ) + + await rag.initialize_storages() + + print("\nInserting test document...") + print(f"Document: {TEST_DOC.strip()}") + print("\n" + "-" * 60) + + await rag.ainsert(TEST_DOC) + + print("\n" + "-" * 60) + print("Checking extracted entities...") + + # Read the graph to see what entities were created + graph_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml") + if os.path.exists(graph_file): + import networkx as nx + + G = nx.read_graphml(graph_file) + print(f"\nEntities in graph ({len(G.nodes())} total):") + for node in sorted(G.nodes()): + print(f" - {node}") + + print(f"\nRelationships: {len(G.edges())}") + else: + print("Graph file not found") + + await rag.finalize_storages() + + print("\n" + "=" * 60) + print("Test complete!") + print("=" * 60) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/lightrag/api/config.py b/lightrag/api/config.py index 4f59d3c1..2750095a 100644 --- a/lightrag/api/config.py +++ b/lightrag/api/config.py @@ -450,6 +450,20 @@ def parse_args() -> argparse.Namespace: "EMBEDDING_TOKEN_LIMIT", None, int, special_none=True ) + # Entity Resolution configuration + args.entity_resolution_enabled = get_env_value( + "ENTITY_RESOLUTION_ENABLED", False, bool + ) + args.entity_resolution_fuzzy_threshold = get_env_value( + "ENTITY_RESOLUTION_FUZZY_THRESHOLD", 0.85, float + ) + args.entity_resolution_vector_threshold = get_env_value( + "ENTITY_RESOLUTION_VECTOR_THRESHOLD", 0.5, float + ) + args.entity_resolution_max_candidates = get_env_value( + "ENTITY_RESOLUTION_MAX_CANDIDATES", 3, int + ) + ollama_server_infos.LIGHTRAG_NAME = args.simulated_model_name ollama_server_infos.LIGHTRAG_TAG = args.simulated_model_tag diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index a8a14c66..f6052dcc 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -2,6 +2,8 @@ LightRAG FastAPI Server """ +from __future__ import annotations + from fastapi import FastAPI, Depends, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse @@ -642,6 +644,23 @@ def create_app(args): raise Exception(f"Failed to import {binding} options: {e}") return {} + def create_entity_resolution_config(args) -> object | None: + """ + Create EntityResolutionConfig from command line/env arguments. + Returns None if entity resolution is disabled. 
+ """ + if not args.entity_resolution_enabled: + return None + + from lightrag.entity_resolution import EntityResolutionConfig + + return EntityResolutionConfig( + enabled=True, + fuzzy_threshold=args.entity_resolution_fuzzy_threshold, + vector_threshold=args.entity_resolution_vector_threshold, + max_candidates=args.entity_resolution_max_candidates, + ) + def create_optimized_embedding_function( config_cache: LLMConfigCache, binding, model, host, api_key, args ) -> EmbeddingFunc: @@ -1029,6 +1048,7 @@ def create_app(args): "entity_types": args.entity_types, }, ollama_server_infos=ollama_server_infos, + entity_resolution_config=create_entity_resolution_config(args), ) except Exception as e: logger.error(f"Failed to initialize LightRAG: {e}") diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 85183bbd..c185c1b3 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -443,7 +443,7 @@ class DocStatusResponse(BaseModel): metadata: Optional[dict[str, Any]] = Field( default=None, description="Additional metadata about the document" ) - file_path: str = Field(description="Path to the document file") + file_path: Optional[str] = Field(default=None, description="Path to the document file") class Config: json_schema_extra = { diff --git a/lightrag/entity_resolution/__init__.py b/lightrag/entity_resolution/__init__.py new file mode 100644 index 00000000..a32888c4 --- /dev/null +++ b/lightrag/entity_resolution/__init__.py @@ -0,0 +1,29 @@ +""" +Entity Resolution Module for LightRAG + +Provides automatic entity deduplication using a 3-layer approach: +1. Case normalization (exact match) +2. Fuzzy string matching (typos) +3. Vector similarity + LLM verification (semantic matches) +""" + +from .resolver import ( + resolve_entity, + resolve_entity_with_vdb, + ResolutionResult, + get_cached_alias, + store_alias, + fuzzy_similarity, +) +from .config import EntityResolutionConfig, DEFAULT_CONFIG + +__all__ = [ + "resolve_entity", + "resolve_entity_with_vdb", + "ResolutionResult", + "EntityResolutionConfig", + "DEFAULT_CONFIG", + "get_cached_alias", + "store_alias", + "fuzzy_similarity", +] diff --git a/lightrag/entity_resolution/config.py b/lightrag/entity_resolution/config.py new file mode 100644 index 00000000..472ca9d4 --- /dev/null +++ b/lightrag/entity_resolution/config.py @@ -0,0 +1,57 @@ +"""Configuration for Entity Resolution + +Uses the same LLM that LightRAG is configured with - no separate model config needed. +""" + +from dataclasses import dataclass, field + + +@dataclass +class EntityResolutionConfig: + """Configuration for the entity resolution system.""" + + # Whether entity resolution is enabled + enabled: bool = True + + # Fuzzy pre-resolution: Enable/disable within-batch fuzzy matching before + # VDB lookup. When enabled, entities in the same batch are matched by string + # similarity alone. Set to False to skip fuzzy pre-resolution entirely (only + # exact case-insensitive matches will be accepted within batch; all other + # resolution goes to VDB/LLM). Disabling reduces false positives but may + # miss obvious typo corrections. 
+ fuzzy_pre_resolution_enabled: bool = True + + # Fuzzy string matching threshold (0-1) + # Above this = auto-match (catches typos like Dupixant/Dupixent at 0.88) + # Below this = continue to vector search + # Tuning advice: + # 0.90+ = Very conservative, near-identical strings (Dupixent/Dupixant) + # 0.85 = Balanced default, catches typos, avoids most false positives + # 0.80 = Aggressive, may merge distinct entities with similar names + # <0.75 = Not recommended, high false positive risk (Celebrex/Cerebyx=0.67) + # Test with your domain data; pharmaceutical names need higher thresholds. + fuzzy_threshold: float = 0.85 + + # Vector similarity threshold for finding candidates + # Low threshold = cast wide net, LLM will verify + # 0.5 catches FDA/US Food and Drug Administration at 0.67 + vector_threshold: float = 0.5 + + # Maximum number of vector candidates to verify with LLM + # Limits cost - uses same LLM as LightRAG main config + max_candidates: int = 3 + + # LLM verification prompt template + llm_prompt_template: str = field( + default="""Are these two terms referring to the same entity? +Consider typos, misspellings, abbreviations, or alternate names. + +Term A: {term_a} +Term B: {term_b} + +Answer only YES or NO.""", + ) + + +# Default configuration +DEFAULT_CONFIG = EntityResolutionConfig() diff --git a/lightrag/entity_resolution/resolver.py b/lightrag/entity_resolution/resolver.py new file mode 100644 index 00000000..8dab65dd --- /dev/null +++ b/lightrag/entity_resolution/resolver.py @@ -0,0 +1,335 @@ +"""Entity Resolution - 3-Layer Approach + +Layer 1: Case normalization (exact match) +Layer 2: Fuzzy string matching (>85% = typos) +Layer 3: Vector similarity + LLM verification (semantic matches) + +Uses the same LLM that LightRAG is configured with. +""" + +from collections.abc import Awaitable, Callable +from dataclasses import dataclass +from difflib import SequenceMatcher + +import numpy as np + +from lightrag.utils import logger +from .config import DEFAULT_CONFIG, EntityResolutionConfig + + +@dataclass +class ResolutionResult: + """Result of entity resolution attempt.""" + + action: str # "match" | "new" + matched_entity: str | None + confidence: float + method: str # "exact" | "fuzzy" | "llm" | "none" | "disabled" + + +def cosine_similarity(a: list[float], b: list[float]) -> float: + """Calculate cosine similarity between two vectors.""" + a_arr, b_arr = np.array(a), np.array(b) + norm_a, norm_b = np.linalg.norm(a_arr), np.linalg.norm(b_arr) + if norm_a == 0 or norm_b == 0: + return 0.0 + return float(np.dot(a_arr, b_arr) / (norm_a * norm_b)) + + +def fuzzy_similarity(a: str, b: str) -> float: + """Calculate fuzzy string similarity (0-1).""" + return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio() + + +def find_vector_candidates( + query_embedding: list[float], + existing_entities: list[tuple[str, list[float]]], + threshold: float, +) -> list[tuple[str, float]]: + """Find entities with vector similarity above threshold.""" + candidates = [] + for name, embedding in existing_entities: + sim = cosine_similarity(query_embedding, embedding) + if sim >= threshold: + candidates.append((name, sim)) + # Sort by similarity descending + candidates.sort(key=lambda x: x[1], reverse=True) + return candidates + + +async def llm_verify( + term_a: str, + term_b: str, + llm_fn: Callable[[str], Awaitable[str]], + prompt_template: str, +) -> bool: + """Ask LLM if two terms refer to the same entity. + + Uses strict parsing with exact token matching only. 
Accepted responses: + - Positive: "YES", "TRUE", "SAME", "MATCH" + - Negative: "NO", "FALSE", "DIFFERENT", "NOT SAME" + Any other response defaults to False to avoid false positive merges. + """ + prompt = prompt_template.format(term_a=term_a, term_b=term_b) + response = await llm_fn(prompt) + + # Normalize response: strip whitespace, take first line only + normalized = response.strip().split("\n")[0].strip().upper() + + # Remove common trailing punctuation + normalized = normalized.rstrip(".!,") + + # Only accept exact tokens (no prefix/substring matching) + if normalized in ("YES", "TRUE", "SAME", "MATCH"): + return True + if normalized in ("NO", "FALSE", "DIFFERENT", "NOT SAME"): + return False + + # Default to False for ambiguous responses (safer than false positive) + return False + + +async def resolve_entity( + entity_name: str, + existing_entities: list[tuple[str, list[float]]], + embed_fn: Callable[[str], Awaitable[list[float]]], + llm_fn: Callable[[str], Awaitable[str]], + config: EntityResolutionConfig = DEFAULT_CONFIG, +) -> ResolutionResult: + """Resolve an entity against existing entities using 3-layer approach. + + Args: + entity_name: The new entity name to resolve + existing_entities: List of (name, embedding) tuples for existing entities + embed_fn: Async function to get embedding for a string (same as LightRAG uses) + llm_fn: Async function to query LLM (same as LightRAG uses) + config: Resolution configuration + + Returns: + ResolutionResult with action ("match" or "new"), matched entity, + confidence, and method used. + """ + if not config.enabled: + return ResolutionResult("new", None, 0.0, "disabled") + + if not existing_entities: + return ResolutionResult("new", None, 0.0, "none") + + normalized = entity_name.lower().strip() + + # Layer 1: Case-insensitive exact match + for name, _ in existing_entities: + if name.lower().strip() == normalized: + return ResolutionResult("match", name, 1.0, "exact") + + # Layer 2: Fuzzy string matching (catches typos) + best_fuzzy_match = None + best_fuzzy_score = 0.0 + + for name, _ in existing_entities: + similarity = fuzzy_similarity(entity_name, name) + if similarity > best_fuzzy_score: + best_fuzzy_score = similarity + best_fuzzy_match = name + + if best_fuzzy_score >= config.fuzzy_threshold: + return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy") + + # Layer 3: Vector similarity + LLM verification + embedding = await embed_fn(entity_name) + candidates = find_vector_candidates( + embedding, + existing_entities, + config.vector_threshold, + ) + + # Verify top candidates with LLM + for candidate_name, similarity in candidates[: config.max_candidates]: + is_same = await llm_verify( + entity_name, + candidate_name, + llm_fn, + config.llm_prompt_template, + ) + if is_same: + return ResolutionResult("match", candidate_name, similarity, "llm") + + # No match found - this is a new entity + return ResolutionResult("new", None, 0.0, "none") + + +async def resolve_entity_with_vdb( + entity_name: str, + entity_vdb, # BaseVectorStorage - imported dynamically to avoid circular imports + llm_fn: Callable[[str], Awaitable[str]], + config: EntityResolutionConfig = DEFAULT_CONFIG, +) -> ResolutionResult: + """Resolve an entity using VDB for similarity search. + + This is the production integration that uses LightRAG's vector database + directly instead of requiring pre-computed embeddings. 
+ Args: + entity_name: The new entity name to resolve + entity_vdb: LightRAG's entity vector database (BaseVectorStorage) + llm_fn: Async function to query LLM (same as LightRAG uses) + config: Resolution configuration + + Returns: + ResolutionResult with action ("match" or "new"), matched entity, + confidence, and method used. + """ + if not config.enabled: + return ResolutionResult("new", None, 0.0, "disabled") + + if entity_vdb is None: + return ResolutionResult("new", None, 0.0, "none") + + normalized = entity_name.lower().strip() + + # Query VDB for similar entities - cast wide net, LLM will verify + # top_k is 3x max_candidates so enough candidates survive the filtering below + try: + candidates = await entity_vdb.query( + entity_name, top_k=config.max_candidates * 3 + ) + except Exception as e: + # Log and skip resolution if VDB query fails + logger.debug(f"VDB query failed for '{entity_name}': {e}") + return ResolutionResult("new", None, 0.0, "none") + + if not candidates: + return ResolutionResult("new", None, 0.0, "none") + + # Layer 1: Case-insensitive exact match among candidates + for candidate in candidates: + candidate_name = candidate.get("entity_name") + if candidate_name and candidate_name.lower().strip() == normalized: + return ResolutionResult("match", candidate_name, 1.0, "exact") + + # Layer 2: Fuzzy string matching (catches typos) + best_fuzzy_match = None + best_fuzzy_score = 0.0 + + for candidate in candidates: + candidate_name = candidate.get("entity_name") + if not candidate_name: + continue + similarity = fuzzy_similarity(entity_name, candidate_name) + if similarity > best_fuzzy_score: + best_fuzzy_score = similarity + best_fuzzy_match = candidate_name + + if best_fuzzy_score >= config.fuzzy_threshold: + return ResolutionResult("match", best_fuzzy_match, best_fuzzy_score, "fuzzy") + + # Layer 3: LLM verification on top candidates + verified_count = 0 + for candidate in candidates: + if verified_count >= config.max_candidates: + break + candidate_name = candidate.get("entity_name") + if not candidate_name: + continue + + is_same = await llm_verify( + entity_name, + candidate_name, + llm_fn, + config.llm_prompt_template, + ) + verified_count += 1 + + if is_same: + # The VDB query does not surface a comparable similarity score here, + # so report a fixed default confidence for LLM-verified matches + similarity = 0.7 # Default confidence for LLM match + return ResolutionResult("match", candidate_name, similarity, "llm") + + # No match found - this is a new entity + return ResolutionResult("new", None, 0.0, "none")
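As a sanity check on the ratios quoted in the config.py comments above, the same numbers can be reproduced with the metric that fuzzy_similarity() uses (difflib.SequenceMatcher on lowercased input). A minimal, stdlib-only sketch:

```python
# Reproduce the threshold examples from config.py with the same metric
# fuzzy_similarity() uses above: difflib.SequenceMatcher on lowercased input.
from difflib import SequenceMatcher


def ratio(a: str, b: str) -> float:
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()


print(ratio("Dupixent", "Dupixant"))  # 0.875 -> above the 0.85 default, merged as a typo
print(ratio("Celebrex", "Cerebyx"))   # ~0.667 -> below 0.85, correctly left unmerged
print(ratio("FDA", "US Food and Drug Administration"))  # far below any threshold -> only Layer 3 can link these
```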
+ + +# --- Alias Cache Functions (PostgreSQL) --- + + +async def get_cached_alias( + alias: str, + db, # PostgresDB instance + workspace: str, +) -> tuple[str, str, float] | None: + """Check if alias is already resolved in cache. + + Args: + alias: The entity name to look up + db: PostgresDB instance with query method + workspace: Workspace for isolation + + Returns: + Tuple of (canonical_entity, method, confidence) if found, None otherwise + """ + # Deferred import: keeps the PostgreSQL backend optional for non-PG deployments + from lightrag.kg.postgres_impl import SQL_TEMPLATES + + normalized_alias = alias.lower().strip() + + sql = SQL_TEMPLATES["get_alias"] + try: + result = await db.query(sql, params=[workspace, normalized_alias]) + if result: + return ( + result["canonical_entity"], + result["method"], + result["confidence"], + ) + except Exception as e: + logger.debug(f"Alias cache lookup error: {e}") + return None + + +async def store_alias( + alias: str, + canonical: str, + method: str, + confidence: float, + db, # PostgresDB instance + workspace: str, +) -> None: + """Store a resolution in the alias cache. + + Args: + alias: The variant name (e.g., "FDA") + canonical: The resolved canonical name (e.g., "US Food and Drug Administration") + method: How it was resolved ('exact', 'fuzzy', 'llm', 'manual') + confidence: Resolution confidence (0-1) + db: PostgresDB instance with execute method + workspace: Workspace for isolation + """ + from datetime import datetime, timezone + + # Deferred import: keeps the PostgreSQL backend optional for non-PG deployments + from lightrag.kg.postgres_impl import SQL_TEMPLATES + + normalized_alias = alias.lower().strip() + + # Don't store self-referential aliases (e.g., "FDA" → "FDA") + if normalized_alias == canonical.lower().strip(): + return + + sql = SQL_TEMPLATES["upsert_alias"] + try: + await db.execute( + sql, + data={ + "workspace": workspace, + "alias": normalized_alias, + "canonical_entity": canonical, + "method": method, + "confidence": confidence, + "create_time": datetime.now(timezone.utc).replace(tzinfo=None), + }, + ) + except Exception as e: + logger.debug(f"Alias cache store error: {e}") diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 1447a79e..97ab8d13 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -1130,7 +1130,14 @@ class PostgreSQLDB: existing_indexes = {row["indexname"] for row in existing_indexes_result} # Create missing indexes + # Tables that don't have an 'id' column (use different primary key structure) + tables_without_id = {"LIGHTRAG_ENTITY_ALIASES"} + for k in table_names: + # Skip tables that don't have an 'id' column + if k in tables_without_id: + continue + # Create index for id column if missing index_name = f"idx_{k.lower()}_id" if index_name not in existing_indexes: @@ -1259,6 +1266,12 @@ class PostgreSQLDB: f"PostgreSQL, Failed to create full entities/relations tables: {e}" ) + # Migrate entity aliases table to add update_time, index on canonical_entity, and confidence constraint + try: + await self._migrate_entity_aliases_schema() + except Exception as e: + logger.error(f"PostgreSQL, Failed to migrate entity aliases schema: {e}") + async def _migrate_create_full_entities_relations_tables(self): """Create LIGHTRAG_FULL_ENTITIES and LIGHTRAG_FULL_RELATIONS tables if they don't exist""" tables_to_check = [ @@ -1323,6 +1336,127 @@ class PostgreSQLDB: except Exception as e: logger.error(f"Failed to create table {table_name}: {e}") + async def _migrate_entity_aliases_schema(self): + """Migrate LIGHTRAG_ENTITY_ALIASES table to add update_time column, canonical index, and confidence constraint""" + table_name = "LIGHTRAG_ENTITY_ALIASES" + + # Check if table exists first + check_table_sql = """ + SELECT table_name + FROM
information_schema.tables + WHERE table_name = $1 + AND table_schema = 'public' + """ + table_exists = await self.query(check_table_sql, [table_name.lower()]) + if not table_exists: + logger.debug(f"Table {table_name} does not exist yet, skipping migration") + return + + # 1. Add update_time column if it doesn't exist + check_column_sql = """ + SELECT column_name + FROM information_schema.columns + WHERE table_name = $1 + AND column_name = 'update_time' + AND table_schema = 'public' + """ + column_exists = await self.query(check_column_sql, [table_name.lower()]) + + if not column_exists: + try: + # Three-step migration to add update_time column: + # 1. Add column WITHOUT default - avoids full table rewrite on large tables + # 2. Backfill existing rows with create_time values + # 3. Set DEFAULT for future inserts + # Note: There's a tiny race window between steps 1-3 where concurrent + # inserts could get NULL. This is acceptable for this migration use case. + # + # Step 1: Add column WITHOUT default (existing rows get NULL) + add_column_sql = f""" + ALTER TABLE {table_name} + ADD COLUMN update_time TIMESTAMP(0) + """ + await self.execute(add_column_sql) + logger.info(f"PostgreSQL, Added update_time column to {table_name}") + + # Step 2: Set existing rows' update_time to their create_time + update_sql = f""" + UPDATE {table_name} + SET update_time = create_time + WHERE update_time IS NULL + """ + await self.execute(update_sql) + logger.info( + f"PostgreSQL, Initialized update_time values in {table_name}" + ) + + # Step 3: Set default for future rows + set_default_sql = f""" + ALTER TABLE {table_name} + ALTER COLUMN update_time SET DEFAULT CURRENT_TIMESTAMP + """ + await self.execute(set_default_sql) + logger.info( + f"PostgreSQL, Set default for update_time column in {table_name}" + ) + except Exception as e: + logger.error( + f"PostgreSQL, Failed to add update_time column to {table_name}: {e}" + ) + + # 2. Create index on (workspace, canonical_entity) for get_aliases_for_canonical query + index_name = "idx_lightrag_entity_aliases_canonical" + check_index_sql = """ + SELECT indexname + FROM pg_indexes + WHERE tablename = $1 + AND indexname = $2 + """ + index_exists = await self.query( + check_index_sql, [table_name.lower(), index_name] + ) + + if not index_exists: + try: + create_index_sql = f""" + CREATE INDEX {index_name} + ON {table_name} (workspace, canonical_entity) + """ + await self.execute(create_index_sql) + logger.info(f"PostgreSQL, Created index {index_name} on {table_name}") + except Exception as e: + logger.error(f"PostgreSQL, Failed to create index {index_name}: {e}") + + # 3. 
Add CHECK constraint for confidence range if it doesn't exist + constraint_name = "confidence_range" + check_constraint_sql = """ + SELECT constraint_name + FROM information_schema.table_constraints + WHERE table_name = $1 + AND constraint_name = $2 + AND constraint_type = 'CHECK' + AND table_schema = 'public' + """ + constraint_exists = await self.query( + check_constraint_sql, [table_name.lower(), constraint_name] + ) + + if not constraint_exists: + try: + add_constraint_sql = f""" + ALTER TABLE {table_name} + ADD CONSTRAINT {constraint_name} + CHECK (confidence >= 0 AND confidence <= 1) + """ + await self.execute(add_constraint_sql) + logger.info( + f"PostgreSQL, Added CHECK constraint {constraint_name} to {table_name}" + ) + except Exception as e: + logger.warning( + f"PostgreSQL, Failed to add CHECK constraint {constraint_name} to {table_name}: {e}" + ) + async def _create_pagination_indexes(self): """Create indexes to optimize pagination queries for LIGHTRAG_DOC_STATUS""" indexes = [ @@ -1402,7 +1536,7 @@ class PostgreSQLDB: "VCHORDRQ": f""" CREATE INDEX {{vector_index_name}} ON {{k}} USING vchordrq (content_vector vector_cosine_ops) - {f'WITH (options = $${self.vchordrq_build_options}$$)' if self.vchordrq_build_options else ''} + {f"WITH (options = $${self.vchordrq_build_options}$$)" if self.vchordrq_build_options else ""} """, } @@ -4906,6 +5040,19 @@ TABLES = { CONSTRAINT LIGHTRAG_RELATION_CHUNKS_PK PRIMARY KEY (workspace, id) )""" }, + "LIGHTRAG_ENTITY_ALIASES": { + "ddl": """CREATE TABLE LIGHTRAG_ENTITY_ALIASES ( + workspace VARCHAR(255), + alias VARCHAR(512), + canonical_entity VARCHAR(512), + method VARCHAR(50), + confidence FLOAT, + create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT LIGHTRAG_ENTITY_ALIASES_PK PRIMARY KEY (workspace, alias), + CONSTRAINT confidence_range CHECK (confidence >= 0 AND confidence <= 1) + )""" + }, } @@ -5117,4 +5264,25 @@ SQL_TEMPLATES = { "drop_specifiy_table_workspace": """ DELETE FROM {table_name} WHERE workspace=$1 """, + # Entity alias cache + "get_alias": """ + SELECT canonical_entity, method, confidence + FROM LIGHTRAG_ENTITY_ALIASES + WHERE workspace=$1 AND alias=$2 + """, + "upsert_alias": """ + INSERT INTO LIGHTRAG_ENTITY_ALIASES + (workspace, alias, canonical_entity, method, confidence, create_time, update_time) + VALUES ($1, $2, $3, $4, $5, $6, $6) + ON CONFLICT (workspace, alias) DO UPDATE SET + canonical_entity = EXCLUDED.canonical_entity, + method = EXCLUDED.method, + confidence = EXCLUDED.confidence, + update_time = CURRENT_TIMESTAMP + """, + "get_aliases_for_canonical": """ + SELECT alias, method, confidence + FROM LIGHTRAG_ENTITY_ALIASES + WHERE workspace=$1 AND canonical_entity=$2 + """, } diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 8a638759..091ad89a 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -26,6 +26,7 @@ from typing import ( ) from lightrag.prompt import PROMPTS from lightrag.exceptions import PipelineCancelledException +from lightrag.entity_resolution import EntityResolutionConfig from lightrag.constants import ( DEFAULT_MAX_GLEANING, DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, @@ -217,6 +218,11 @@ class LightRAG: ) ) + entity_resolution_config: EntityResolutionConfig | None = field(default=None) + """Configuration for entity resolution (deduplication). + Set to EntityResolutionConfig() to enable, or None to disable. 
+ Resolves entities like 'FDA' → 'US Food and Drug Administration'.""" + # Text chunking # --- diff --git a/lightrag/operate.py b/lightrag/operate.py index c6724974..acdef5b5 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -5,6 +5,7 @@ from pathlib import Path import asyncio import json import json_repair +import re from typing import Any, AsyncIterator, overload, Literal from collections import Counter, defaultdict @@ -50,6 +51,13 @@ from lightrag.base import ( QueryContextResult, ) from lightrag.prompt import PROMPTS +from lightrag.entity_resolution import ( + resolve_entity_with_vdb, + get_cached_alias, + store_alias, + fuzzy_similarity, + EntityResolutionConfig, +) from lightrag.constants import ( GRAPH_FIELD_SEP, DEFAULT_MAX_ENTITY_TOKENS, @@ -1590,6 +1598,180 @@ async def _rebuild_single_relationship( pipeline_status["history_messages"].append(status_message) +def _has_different_numeric_suffix(name_a: str, name_b: str) -> bool: + """Check if two names have different numeric components. + + This prevents false fuzzy matches between entities that differ only by number, + such as "Interleukin-4" vs "Interleukin-13" (88.9% similar but semantically distinct). + + Scientific/medical entities often use numbers as key identifiers: + - Interleukins: IL-4, IL-13, IL-17 + - Drug phases: Phase 1, Phase 2, Phase 3 + - Receptor types: Type 1, Type 2 + - Versions: v1.0, v2.0 + + Args: + name_a: First entity name + name_b: Second entity name + + Returns: + True if both names contain numbers but the numbers differ, False otherwise. + """ + # Extract all numeric patterns (integers and decimals) + pattern = r"(\d+(?:\.\d+)?)" + nums_a = re.findall(pattern, name_a) + nums_b = re.findall(pattern, name_b) + + # If both have numbers and they differ, these are likely distinct entities + if nums_a and nums_b and nums_a != nums_b: + return True + return False + + +async def _build_pre_resolution_map( + entity_names: list[str], + entity_types: dict[str, str], + entity_vdb, + llm_fn, + config: EntityResolutionConfig, +) -> tuple[dict[str, str], dict[str, float]]: + """Build resolution map before parallel processing to prevent race conditions. + + This function resolves entities against each other within the batch (using + instant fuzzy matching) and against existing VDB entries. The resulting map + is applied during parallel entity processing. + + Args: + entity_names: List of entity names to resolve + entity_types: Dict mapping entity names to their types (e.g., "person", "organization"). + Used to prevent fuzzy matching between entities of different types. + entity_vdb: Entity vector database for checking existing entities + llm_fn: LLM function for semantic verification + config: Entity resolution configuration + + Returns: + Tuple of: + - resolution_map: Dict mapping original entity names to their resolved canonical names. + Only entities that need remapping are included. + - confidence_map: Dict mapping alias to confidence score (1.0 for exact, actual + similarity for fuzzy, result.confidence for VDB matches). + """ + resolution_map: dict[str, str] = {} + confidence_map: dict[str, float] = {} + # Track canonical entities with their types: [(name, type), ...] 
+ canonical_entities: list[tuple[str, str]] = [] + + for entity_name in entity_names: + normalized = entity_name.lower().strip() + entity_type = entity_types.get(entity_name, "") + + # Skip if already resolved to something in this batch + if entity_name in resolution_map: + continue + + # Layer 1: Case-insensitive exact match within batch + matched = False + for canonical, canonical_type in canonical_entities: + if canonical.lower().strip() == normalized: + resolution_map[entity_name] = canonical + confidence_map[entity_name] = 1.0 # Exact match = perfect confidence + logger.debug( + f"Pre-resolution (case match): '{entity_name}' → '{canonical}'" + ) + matched = True + break + + if matched: + continue + + # Layer 2: Fuzzy match within batch (catches typos like Dupixant→Dupixent) + # Only enabled when config.fuzzy_pre_resolution_enabled is True. + # Requires: similarity >= threshold AND matching types (or unknown). + if config.fuzzy_pre_resolution_enabled: + for canonical, canonical_type in canonical_entities: + similarity = fuzzy_similarity(entity_name, canonical) + if similarity >= config.fuzzy_threshold: + # Type compatibility check: skip if types differ and both known. + # Empty/unknown types are treated as compatible to avoid + # blocking legitimate matches when type info is incomplete. + types_compatible = ( + not entity_type + or not canonical_type + or entity_type == canonical_type + ) + if not types_compatible: + logger.debug( + f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED " + f"'{entity_name}' ({entity_type}) → " + f"'{canonical}' ({canonical_type}) - type mismatch" + ) + continue + + # Numeric suffix check: skip if names have different numbers + # This prevents false matches like "Interleukin-4" → "Interleukin-13" + # where fuzzy similarity is high (88.9%) but entities are distinct + if _has_different_numeric_suffix(entity_name, canonical): + logger.debug( + f"Pre-resolution (fuzzy {similarity:.2f}): SKIPPED " + f"'{entity_name}' → '{canonical}' - different numeric suffix" + ) + continue + + # Accept the fuzzy match - emit warning for review + resolution_map[entity_name] = canonical + confidence_map[entity_name] = ( + similarity # Use actual similarity score + ) + etype_display = entity_type or "unknown" + ctype_display = canonical_type or "unknown" + logger.warning( + f"Fuzzy pre-resolution accepted: '{entity_name}' → " + f"'{canonical}' (similarity={similarity:.3f}, " + f"types: {etype_display}→{ctype_display}). " + f"Review for correctness; adjust fuzzy_threshold or " + f"disable fuzzy_pre_resolution_enabled if needed." + ) + matched = True + break + + if matched: + continue + + # Layer 3: Check existing VDB for cross-document deduplication + if entity_vdb and llm_fn: + try: + result = await resolve_entity_with_vdb( + entity_name, entity_vdb, llm_fn, config + ) + if result.action == "match" and result.matched_entity: + resolution_map[entity_name] = result.matched_entity + confidence_map[entity_name] = ( + result.confidence + ) # Use VDB result confidence + # Add canonical from VDB so batch entities can match it. + # VDB matches don't have type info available, use empty. 
+ canonical_entities.append((result.matched_entity, "")) + logger.debug( + f"Pre-resolution (VDB {result.method}): " + f"'{entity_name}' → '{result.matched_entity}'" + ) + continue + except Exception as e: + logger.debug( + f"Pre-resolution VDB check failed for '{entity_name}': {e}" + ) + + # No match found - this is a new canonical entity + canonical_entities.append((entity_name, entity_type)) + + if resolution_map: + logger.info( + f"Pre-resolution: {len(resolution_map)} entities mapped to canonical forms" + ) + + return resolution_map, confidence_map + + async def _merge_nodes_then_upsert( entity_name: str, nodes_data: list[dict], @@ -1600,8 +1782,128 @@ async def _merge_nodes_then_upsert( pipeline_status_lock=None, llm_response_cache: BaseKVStorage | None = None, entity_chunks_storage: BaseKVStorage | None = None, -): - """Get existing nodes from knowledge graph use name,if exists, merge data, else create, then upsert.""" + pre_resolution_map: dict[str, str] | None = None, +) -> tuple[dict, str | None]: + """Get the existing node from the knowledge graph by name; if it exists, merge in the new data, otherwise create it; then upsert. + + Returns: + Tuple of (node_data, original_entity_name). original_entity_name is set if + entity resolution changed the name (e.g., "Dupixant" → "Dupixent"), + otherwise None. + """ + original_entity_name = entity_name # Track original before resolution + + # Apply pre-resolution map immediately (prevents race conditions in parallel processing) + pre_resolved = False + if pre_resolution_map and entity_name in pre_resolution_map: + entity_name = pre_resolution_map[entity_name] + pre_resolved = True + logger.debug( + f"Applied pre-resolution: '{original_entity_name}' → '{entity_name}'" + ) + + # Entity Resolution: Resolve new entity against existing entities + # Skip if already pre-resolved (to avoid redundant VDB queries) + entity_resolution_config_raw = global_config.get("entity_resolution_config") + entity_resolution_config = None + if entity_resolution_config_raw: + # Handle both dict (from asdict() serialization) and EntityResolutionConfig instances + if isinstance(entity_resolution_config_raw, EntityResolutionConfig): + entity_resolution_config = entity_resolution_config_raw + elif isinstance(entity_resolution_config_raw, dict): + try: + entity_resolution_config = EntityResolutionConfig( + **entity_resolution_config_raw + ) + except TypeError as e: + logger.warning( + f"Invalid entity_resolution_config: {e}. " + f"Config: {entity_resolution_config_raw}. Skipping resolution."
+ ) + + # Safely check if entity resolution is enabled, handling both object and dict forms + def _is_resolution_enabled(config) -> bool: + if config is None: + return False + if isinstance(config, dict): + return config.get("enabled", False) + return getattr(config, "enabled", False) + + # Skip VDB resolution if entity was already pre-resolved (prevents redundant queries) + if ( + _is_resolution_enabled(entity_resolution_config) + and entity_vdb + and not pre_resolved + ): + original_name = entity_name + workspace = global_config.get("workspace", "") + # Try knowledge_graph_inst.db first (more reliable), fallback to entity_vdb.db + db = getattr(knowledge_graph_inst, "db", None) or getattr( + entity_vdb, "db", None + ) + + # Layer 0: Check alias cache first (PostgreSQL-only - requires db connection) + # Note: Alias caching is only available when using PostgreSQL storage backend + if db is not None: + try: + cached = await get_cached_alias(original_name, db, workspace) + if cached: + canonical, method, _ = cached + logger.debug( + f"Alias cache hit: '{original_name}' → '{canonical}' " + f"(method: {method})" + ) + entity_name = canonical + except Exception as e: + logger.warning( + f"Entity resolution cache lookup failed for '{original_name}' " + f"(workspace: {workspace}): {type(e).__name__}: {e}. " + "Continuing without cache." + ) + + # Layers 1-3: Full VDB resolution (if not found in cache) + if entity_name == original_name: + llm_fn = global_config.get("llm_model_func") + if llm_fn: + try: + resolution = await resolve_entity_with_vdb( + entity_name, + entity_vdb, + llm_fn, + entity_resolution_config, + ) + if resolution.action == "match" and resolution.matched_entity: + logger.info( + f"Entity resolution: '{entity_name}' → '{resolution.matched_entity}' " + f"(method: {resolution.method}, confidence: {resolution.confidence:.2f})" + ) + entity_name = resolution.matched_entity + + # Store in alias cache for next time (PostgreSQL-only) + # Note: Alias caching requires PostgreSQL storage backend + if db is not None: + try: + await store_alias( + original_name, + entity_name, + resolution.method, + resolution.confidence, + db, + workspace, + ) + except Exception as e: + logger.warning( + f"Failed to store entity alias '{original_name}' → '{entity_name}' " + f"(workspace: {workspace}): {type(e).__name__}: {e}. " + "Resolution succeeded but cache not updated." + ) + except Exception as e: + logger.warning( + f"Entity resolution failed for '{original_name}' " + f"(workspace: {workspace}): {type(e).__name__}: {e}. " + "Continuing with original entity name." 
+ ) + already_entity_types = [] already_source_ids = [] already_description = [] @@ -1865,7 +2167,12 @@ async def _merge_nodes_then_upsert( max_retries=3, retry_delay=0.1, ) - return node_data + + # Return original name if resolution changed it, None otherwise + resolved_from = ( + original_entity_name if entity_name != original_entity_name else None + ) + return node_data, resolved_from async def _merge_edges_then_upsert( @@ -1882,7 +2189,13 @@ async def _merge_edges_then_upsert( added_entities: list = None, # New parameter to track entities added during edge processing relation_chunks_storage: BaseKVStorage | None = None, entity_chunks_storage: BaseKVStorage | None = None, + entity_resolution_map: dict[str, str] | None = None, # Map original→resolved names ): + # Apply entity resolution mapping to edge endpoints + if entity_resolution_map: + src_id = entity_resolution_map.get(src_id, src_id) + tgt_id = entity_resolution_map.get(tgt_id, tgt_id) + if src_id == tgt_id: return None @@ -2472,6 +2785,76 @@ async def merge_nodes_and_edges( graph_max_async = global_config.get("llm_model_max_async", 4) * 2 semaphore = asyncio.Semaphore(graph_max_async) + # ===== Pre-Resolution Phase: Build entity resolution map ===== + # This prevents race conditions when parallel workers process similar entities + # IMPORTANT: Include BOTH entity names AND relation endpoints to catch all duplicates + pre_resolution_map: dict[str, str] = {} + entity_resolution_config_raw = global_config.get("entity_resolution_config") + if entity_resolution_config_raw: + # Handle both dict (from asdict() serialization) and EntityResolutionConfig instances + config = None + if isinstance(entity_resolution_config_raw, EntityResolutionConfig): + config = entity_resolution_config_raw + elif isinstance(entity_resolution_config_raw, dict): + try: + config = EntityResolutionConfig(**entity_resolution_config_raw) + except TypeError as e: + logger.warning( + f"Invalid entity_resolution_config: {e}. " + f"Config: {entity_resolution_config_raw}. Skipping resolution." + ) + if config and config.enabled: + llm_fn = global_config.get("llm_model_func") + # Build entity_types map for type-aware fuzzy matching. + # Use first non-empty type for entities with multiple occurrences. 
+ entity_types: dict[str, str] = {} + for entity_name, entities in all_nodes.items(): + for entity_data in entities: + etype = entity_data.get("entity_type", "") + if etype: + entity_types[entity_name] = etype + break + + # Collect ALL entity names: from entities AND from relation endpoints + # This ensures relation endpoints like "EU Medicines Agency" get resolved + # against existing entities like "European Medicines Agency" + all_entity_names = set(all_nodes.keys()) + for src_id, tgt_id in all_edges.keys(): + all_entity_names.add(src_id) + all_entity_names.add(tgt_id) + + pre_resolution_map, confidence_map = await _build_pre_resolution_map( + list(all_entity_names), + entity_types, + entity_vdb, + llm_fn, + config, + ) + + # Cache pre-resolution aliases for future lookups (PostgreSQL-only) + # This ensures aliases discovered during batch processing are available + # for subsequent document ingestion without re-running resolution + db = getattr(knowledge_graph_inst, "db", None) + if db is not None and pre_resolution_map: + workspace = global_config.get("workspace", "") + for alias, canonical in pre_resolution_map.items(): + # Don't cache self-references (entity → itself) + if alias.lower().strip() != canonical.lower().strip(): + try: + await store_alias( + alias=alias, + canonical=canonical, + method="pre_resolution", + confidence=confidence_map.get(alias, 1.0), + db=db, + workspace=workspace, + ) + except Exception as e: + logger.debug( + f"Failed to cache pre-resolution alias " + f"'{alias}' → '{canonical}': {e}" + ) + # ===== Phase 1: Process all entities concurrently ===== log_message = f"Phase 1: Processing {total_entities_count} entities from {doc_id} (async: {graph_max_async})" logger.info(log_message) @@ -2479,6 +2862,11 @@ async def merge_nodes_and_edges( pipeline_status["latest_message"] = log_message pipeline_status["history_messages"].append(log_message) + # Resolution map to track original→resolved entity names (e.g., "Dupixant"→"Dupixent") + # This will be used to remap edge endpoints in Phase 2 + entity_resolution_map: dict[str, str] = {} + resolution_map_lock = asyncio.Lock() + async def _locked_process_entity_name(entity_name, entities): async with semaphore: # Check for cancellation before processing entity @@ -2496,7 +2884,7 @@ async def merge_nodes_and_edges( ): try: logger.debug(f"Processing entity {entity_name}") - entity_data = await _merge_nodes_then_upsert( + entity_data, resolved_from = await _merge_nodes_then_upsert( entity_name, entities, knowledge_graph_inst, @@ -2506,8 +2894,15 @@ async def merge_nodes_and_edges( pipeline_status_lock, llm_response_cache, entity_chunks_storage, + pre_resolution_map, ) + # Track resolution mapping for edge remapping in Phase 2 + if resolved_from is not None: + resolved_to = entity_data.get("entity_name", entity_name) + async with resolution_map_lock: + entity_resolution_map[resolved_from] = resolved_to + return entity_data except Exception as e: @@ -2617,6 +3012,7 @@ async def merge_nodes_and_edges( added_entities, # Pass list to collect added entities relation_chunks_storage, entity_chunks_storage, # Add entity_chunks_storage parameter + entity_resolution_map, # Apply entity resolution to edge endpoints ) if edge_data is None: @@ -2649,9 +3045,36 @@ async def merge_nodes_and_edges( raise prefixed_exception from e # Create relationship processing tasks - edge_tasks = [] + # Apply pre_resolution_map to edge endpoints to prevent duplicates from relation extraction + # Key fixes: sort for lock ordering, filter self-loops, 
deduplicate merged edges + resolved_edges: dict[tuple[str, str], list] = {} for edge_key, edges in all_edges.items(): - task = asyncio.create_task(_locked_process_edges(edge_key, edges)) + # Remap edge endpoints using pre-resolution map + # This catches cases like "EU Medicines Agency" → "European Medicines Agency" + resolved_src = pre_resolution_map.get(edge_key[0], edge_key[0]) + resolved_tgt = pre_resolution_map.get(edge_key[1], edge_key[1]) + + # Skip self-loops created by resolution (e.g., both endpoints resolve to same entity) + if resolved_src == resolved_tgt: + logger.debug( + f"Skipping self-loop after resolution: {edge_key} → ({resolved_src}, {resolved_tgt})" + ) + continue + + # Sort for consistent lock ordering (prevents deadlocks) + resolved_edge_key = tuple(sorted([resolved_src, resolved_tgt])) + + # Merge edges that resolve to same key (deduplication) + if resolved_edge_key not in resolved_edges: + resolved_edges[resolved_edge_key] = [] + resolved_edges[resolved_edge_key].extend(edges) + + # Create tasks from deduplicated edges + edge_tasks = [] + for resolved_edge_key, merged_edges in resolved_edges.items(): + task = asyncio.create_task( + _locked_process_edges(resolved_edge_key, merged_edges) + ) edge_tasks.append(task) # Execute relationship tasks with error handling
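Before the web UI diffs below, the endpoint remapping added above is easiest to see on a toy batch. A minimal sketch of the same dict logic (entity names are hypothetical; locking and storage are omitted):

```python
# Toy run of the remapping above: apply the pre-resolution map to both
# endpoints, drop self-loops the mapping creates, and merge edge lists
# under a sorted key so parallel workers always lock in the same order.
pre_resolution_map = {"EU Medicines Agency": "European Medicines Agency"}
all_edges = {
    ("Dupixent", "EU Medicines Agency"): [{"weight": 1.0}],
    ("European Medicines Agency", "Dupixent"): [{"weight": 2.0}],
}

resolved_edges: dict[tuple, list] = {}
for (src, tgt), edges in all_edges.items():
    src = pre_resolution_map.get(src, src)
    tgt = pre_resolution_map.get(tgt, tgt)
    if src == tgt:
        continue  # self-loop created by resolution; skipped, as in the code above
    key = tuple(sorted((src, tgt)))
    resolved_edges.setdefault(key, []).extend(edges)

# Both input edges collapse onto one canonical, deduplicated edge:
# {('Dupixent', 'European Medicines Agency'): [{'weight': 1.0}, {'weight': 2.0}]}
print(resolved_edges)
```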
diff --git a/lightrag_webui/src/App.tsx b/lightrag_webui/src/App.tsx index b8ae023d..83c5d10e 100644 --- a/lightrag_webui/src/App.tsx +++ b/lightrag_webui/src/App.tsx @@ -2,7 +2,6 @@ import { useState, useCallback, useEffect, useRef } from 'react' import ThemeProvider from '@/components/ThemeProvider' import TabVisibilityProvider from '@/contexts/TabVisibilityProvider' import ApiKeyAlert from '@/components/ApiKeyAlert' -import StatusIndicator from '@/components/status/StatusIndicator' import { SiteInfo, webuiPrefix } from '@/lib/constants' import { useBackendState, useAuthStore } from '@/stores/state' import { useSettingsStore } from '@/stores/settings' @@ -218,7 +217,6 @@ function App() { - {enableHealthCheck && <StatusIndicator />} )} diff --git a/lightrag_webui/src/components/AppSettings.tsx b/lightrag_webui/src/components/AppSettings.tsx index 09312794..e7c27e90 100644 --- a/lightrag_webui/src/components/AppSettings.tsx +++ b/lightrag_webui/src/components/AppSettings.tsx @@ -1,9 +1,7 @@ -import { useState, useCallback } from 'react' -import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/Popover' +import { useCallback } from 'react' import Button from '@/components/ui/Button' -import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/Select' import { useSettingsStore } from '@/stores/settings' -import { PaletteIcon } from 'lucide-react' +import { SunIcon, MoonIcon } from 'lucide-react' import { useTranslation } from 'react-i18next' import { cn } from '@/lib/utils' @@ -12,63 +10,39 @@ interface AppSettingsProps { } export default function AppSettings({ className }: AppSettingsProps) { - const [opened, setOpened] = useState(false) const { t } = useTranslation() - const language = useSettingsStore.use.language() - const setLanguage = useSettingsStore.use.setLanguage() const theme = useSettingsStore.use.theme() const setTheme = useSettingsStore.use.setTheme() - const handleLanguageChange = useCallback((value: string) => { - setLanguage(value as 'en' | 'zh' | 'fr' | 'ar' | 'zh_TW') - }, [setLanguage]) + // Compute effective theme for icon/tooltip display when theme is 'system' + const effectiveTheme = theme === 'system' ? (window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light') : theme - const handleThemeChange = useCallback((value: string) => { - setTheme(value as 'light' | 'dark' | 'system') - }, [setTheme]) + const handleThemeToggle = useCallback(() => { + if (theme === 'system') { + // Detect actual system preference and toggle to opposite + const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches + setTheme(isDark ? 'light' : 'dark') + } else { + setTheme(theme === 'dark' ? 'light' : 'dark') + } + }, [theme, setTheme]) return ( - <Popover open={opened} onOpenChange={setOpened}> - <PopoverTrigger asChild> - <Button variant="ghost" size="icon" className={cn('h-9 w-9', className)}> - <PaletteIcon className="size-4" /> - </Button> - </PopoverTrigger> - <PopoverContent align="end"> - ... language Select ('en' | 'zh' | 'fr' | 'ar' | 'zh_TW') and theme Select ('light' | 'dark' | 'system') ... - </PopoverContent> - </Popover> + <Button variant="ghost" size="icon" onClick={handleThemeToggle} className={cn('h-9 w-9', className)}> + {effectiveTheme === 'dark' ? <SunIcon className="size-4" /> : <MoonIcon className="size-4" />} + </Button> ) } diff --git a/lightrag_webui/src/components/status/StatusIndicator.tsx b/lightrag_webui/src/components/status/StatusIndicator.tsx index 5a9fc751..d1add109 100644 --- a/lightrag_webui/src/components/status/StatusIndicator.tsx +++ b/lightrag_webui/src/components/status/StatusIndicator.tsx @@ -4,7 +4,7 @@ import { useEffect, useState } from 'react' import StatusDialog from './StatusDialog' import { useTranslation } from 'react-i18next' -const StatusIndicator = () => { +const StatusIndicator = ({ className }: { className?: string }) => { const { t } = useTranslation() const health = useBackendState.use.health() const lastCheckTime = useBackendState.use.lastCheckTime() @@ -20,7 +20,7 @@ const StatusIndicator = () => { }, [lastCheckTime]) return ( - <div ...> + <div className={className} ...> ... onClick={() => setDialogOpen(true)} ... diff --git a/lightrag_webui/src/features/SiteHeader.tsx b/lightrag_webui/src/features/SiteHeader.tsx index dbea38bd..c7ada3b2 100644 --- a/lightrag_webui/src/features/SiteHeader.tsx +++ b/lightrag_webui/src/features/SiteHeader.tsx @@ -1,13 +1,14 @@ import Button from '@/components/ui/Button' -import { SiteInfo, webuiPrefix } from '@/lib/constants' +import { webuiPrefix } from '@/lib/constants' import AppSettings from '@/components/AppSettings' +import StatusIndicator from '@/components/status/StatusIndicator' import { TabsList, TabsTrigger } from '@/components/ui/Tabs' import { useSettingsStore } from '@/stores/settings' import { useAuthStore } from '@/stores/state' import { cn } from '@/lib/utils' import { useTranslation } from 'react-i18next' import { navigationService } from '@/services/navigation' -import { ZapIcon, GithubIcon, LogOutIcon } from 'lucide-react' +import { ZapIcon, LogOutIcon } from 'lucide-react' import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/Tooltip' interface NavigationTabProps { @@ -56,17 +57,8 @@ function TabsNavigation() { export default function SiteHeader() { const { t } = useTranslation() - const { isGuestMode, coreVersion, apiVersion, username, webuiTitle, webuiDescription } = useAuthStore() - - const versionDisplay = (coreVersion && apiVersion) ? `${coreVersion}/${apiVersion}` : null; - - // Check if frontend needs rebuild (apiVersion ends with warning symbol) - const hasWarning = apiVersion?.endsWith('⚠️'); - const versionTooltip = hasWarning ? t('header.frontendNeedsRebuild') : versionDisplay ? `v${versionDisplay}` : ''; + const { isGuestMode, username, webuiTitle, webuiDescription } = useAuthStore() + const enableHealthCheck = useSettingsStore.use.enableHealthCheck() const handleLogout = () => { navigationService.navigateToLogin(); @@ -77,7 +69,6 @@ export default function SiteHeader() { ... {webuiTitle && ( ... @@ -111,25 +102,7 @@
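For completeness, a minimal way to exercise the feature programmatically, mirroring examples/test_entity_resolution.py above. The working directory and document text are illustrative; OPENAI_API_KEY must be set, and the alias cache additionally requires the PostgreSQL backend from docker-compose.test.yml:

```python
# Minimal end-to-end sketch of the feature in this PR, following the
# bundled example script. Uses the default file-based storages; the
# LIGHTRAG_ENTITY_ALIASES cache only comes into play on PostgreSQL.
import asyncio

from lightrag import LightRAG
from lightrag.entity_resolution import EntityResolutionConfig
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed


async def main() -> None:
    rag = LightRAG(
        working_dir="./er_demo",  # illustrative path
        embedding_func=openai_embed,
        llm_model_func=gpt_4o_mini_complete,
        entity_resolution_config=EntityResolutionConfig(
            enabled=True,
            fuzzy_threshold=0.85,  # Layer 2: auto-merge near-identical strings
            vector_threshold=0.5,  # Layer 3: candidate recall before LLM check
            max_candidates=3,      # Layer 3: LLM verification budget per entity
        ),
    )
    await rag.initialize_storages()
    # "FDA" and "US Food and Drug Administration" should resolve to one node.
    await rag.ainsert(
        "The FDA approved Dupixent in 2017. The US Food and Drug "
        "Administration later expanded the drug's indications."
    )
    await rag.finalize_storages()


asyncio.run(main())
```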