# LightRAG/examples/test_entity_resolution.py

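"""
Quick manual test for the Entity Resolution feature.

Inserts a short document and prints the entities that end up in the graph,
so you can check by eye that:
1. "FDA" and "US Food and Drug Administration" resolve to the same entity
2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching

Nothing is asserted automatically. Requires the OPENAI_API_KEY environment
variable to be set.
"""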

import asyncio
import logging
import os
import shutil

from lightrag import LightRAG
from lightrag.entity_resolution import EntityResolutionConfig
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.utils import logger

# Scratch directory for this script; main() deletes and recreates it on every run.
WORKING_DIR = './test_entity_resolution'

# Sample text mentioning the same entities under different surface forms
# ("FDA" / "US Food and Drug Administration", "Dupixent" / "Dupixant"),
# which entity resolution is expected to merge. The string starts and ends
# with a newline.
TEST_DOC = (
    '\n'
    'The FDA approved Dupixent for treating eczema in 2017.\n'
    "The US Food and Drug Administration later expanded the drug's indications.\n"
    'Dupixant (sometimes misspelled) has shown good results in clinical trials.\n'
    'The FDA continues to monitor the safety of Dupixent.\n'
)


async def main():
    """Run the entity-resolution smoke test end to end.

    Recreates ``WORKING_DIR`` from scratch, builds a ``LightRAG`` instance with
    entity resolution enabled, inserts ``TEST_DOC``, and prints the node names
    and edge count read from the GraphML file in the working directory so the
    result can be inspected by eye. Nothing is asserted automatically.

    Returns early, after printing an error message, when the
    ``OPENAI_API_KEY`` environment variable is not set. Exceptions raised
    while inserting or reading the graph propagate to the caller, but the
    storages are finalized first.
    """
    if not os.getenv('OPENAI_API_KEY'):
        print('Error: Set OPENAI_API_KEY environment variable')
        return

    # Clean up previous test
    if os.path.exists(WORKING_DIR):
        shutil.rmtree(WORKING_DIR)
    os.makedirs(WORKING_DIR)

    # Set up logging to see resolution messages
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    print('\n' + '=' * 60)
    print('Entity Resolution Test')
    print('=' * 60)

    rag = LightRAG(
        working_dir=WORKING_DIR,
        embedding_func=openai_embed,
        llm_model_func=gpt_4o_mini_complete,
        entity_resolution_config=EntityResolutionConfig(
            enabled=True,
            fuzzy_threshold=0.85,
            vector_threshold=0.5,
            max_candidates=3,
        ),
    )

    await rag.initialize_storages()

    # From here on the storages are open: make sure they are finalized even
    # if the insert or the graph read below fails.
    try:
        print('\nInserting test document...')
        print(f'Document: {TEST_DOC.strip()}')
        print('\n' + '-' * 60)

        await rag.ainsert(TEST_DOC)

        print('\n' + '-' * 60)
        print('Checking extracted entities...')

        # Read the graph to see what entities were created.
        # NOTE(review): presumably this file is written by the default
        # file-based graph storage; confirm the name if the backend changes.
        graph_file = os.path.join(WORKING_DIR, 'graph_chunk_entity_relation.graphml')
        if os.path.exists(graph_file):
            # Imported lazily so the dependency is only needed on this path.
            import networkx as nx

            G = nx.read_graphml(graph_file)
            print(f'\nEntities in graph ({len(G.nodes())} total):')
            for node in sorted(G.nodes()):
                print(f'  - {node}')

            print(f'\nRelationships: {len(G.edges())}')
        else:
            print('Graph file not found')
    finally:
        await rag.finalize_storages()

    print('\n' + '=' * 60)
    print('Test complete!')
    print('=' * 60)


# Script entry point: drive the async main() to completion on a fresh event
# loop when the file is executed directly (not when it is imported).
if __name__ == '__main__':
    entry_coro = main()
    asyncio.run(entry_coro)