Implement automatic entity resolution to prevent duplicate nodes in the knowledge graph. The system uses a 3-layer approach: 1. Case-insensitive exact matching (free, instant) 2. Fuzzy string matching >85% threshold (free, instant) 3. Vector similarity + LLM verification (for acronyms/synonyms) Key features: - Pre-resolution phase prevents race conditions in parallel processing - Numeric suffix detection blocks false matches (IL-4 ≠ IL-13) - PostgreSQL alias cache for fast lookups on subsequent ingestion - Configurable thresholds via environment variables Bug fixes included: - Fix fuzzy matching false positives for numbered entities - Fix alias cache not being populated (missing db parameter) - Skip entity_aliases table from generic id index creation New files: - lightrag/entity_resolution/ - Core resolution module - tests/test_entity_resolution/ - Unit tests - docker/postgres-age-vector/ - Custom PG image with pgvector + AGE - docker-compose.test.yml - Integration test environment Configuration (env.example): - ENTITY_RESOLUTION_ENABLED=true - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85 - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5 - ENTITY_RESOLUTION_MAX_CANDIDATES=3
93 lines
2.5 KiB
Python
93 lines
2.5 KiB
Python
"""
|
|
Quick test for Entity Resolution feature.
|
|
|
|
Tests that:
|
|
1. "FDA" and "US Food and Drug Administration" resolve to the same entity
|
|
2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import shutil
|
|
|
|
from lightrag import LightRAG
|
|
from lightrag.entity_resolution import EntityResolutionConfig
|
|
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
|
|
from lightrag.utils import logger
|
|
import logging
|
|
|
|
# Scratch directory for this run; main() deletes and recreates it on start.
WORKING_DIR = "./test_entity_resolution"

# Test document with entities that should be deduplicated:
#   - "FDA" vs. "US Food and Drug Administration" (acronym / full name)
#   - "Dupixent" vs. "Dupixant" (single-character misspelling)
TEST_DOC = """
The FDA approved Dupixent for treating eczema in 2017.
The US Food and Drug Administration later expanded the drug's indications.
Dupixant (sometimes misspelled) has shown good results in clinical trials.
The FDA continues to monitor the safety of Dupixent.
"""
|
|
|
|
|
|
async def main():
    """Run an end-to-end smoke check of LightRAG entity resolution.

    Recreates ``WORKING_DIR`` from scratch, builds a ``LightRAG`` instance
    with entity resolution enabled, inserts ``TEST_DOC`` and then prints the
    node names and edge count found in the GraphML file written to the
    working directory. Nothing is asserted; the output is meant to be
    inspected by eye for duplicate entities.

    Requires the ``OPENAI_API_KEY`` environment variable. When it is unset
    the function prints an error message and returns without touching the
    filesystem.

    Returns:
        None.
    """
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: Set OPENAI_API_KEY environment variable")
        return

    # Clean up previous test so stale graph data cannot mask duplicates.
    if os.path.exists(WORKING_DIR):
        shutil.rmtree(WORKING_DIR)
    os.makedirs(WORKING_DIR)

    # Set up logging to see resolution messages
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    print("\n" + "=" * 60)
    print("Entity Resolution Test")
    print("=" * 60)

    rag = LightRAG(
        working_dir=WORKING_DIR,
        embedding_func=openai_embed,
        llm_model_func=gpt_4o_mini_complete,
        entity_resolution_config=EntityResolutionConfig(
            enabled=True,
            fuzzy_threshold=0.85,
            vector_threshold=0.5,
            max_candidates=3,
        ),
    )

    await rag.initialize_storages()
    try:
        print("\nInserting test document...")
        print(f"Document: {TEST_DOC.strip()}")
        print("\n" + "-" * 60)

        await rag.ainsert(TEST_DOC)

        print("\n" + "-" * 60)
        print("Checking extracted entities...")

        # Read the graph to see what entities were created
        graph_file = os.path.join(
            WORKING_DIR, "graph_chunk_entity_relation.graphml"
        )
        if os.path.exists(graph_file):
            # Imported lazily: only needed when a graph file was produced.
            import networkx as nx

            G = nx.read_graphml(graph_file)
            print(f"\nEntities in graph ({len(G.nodes())} total):")
            for node in sorted(G.nodes()):
                print(f"  - {node}")

            print(f"\nRelationships: {len(G.edges())}")
        else:
            print("Graph file not found")
    finally:
        # Always finalize storages, even if ingestion or graph inspection
        # raised; otherwise an error would skip this cleanup entirely.
        await rag.finalize_storages()

    print("\n" + "=" * 60)
    print("Test complete!")
    print("=" * 60)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the async smoke test only when executed as a script, not on import.
    asyncio.run(main())
|