Format entire codebase with ruff and add type hints across all modules: - Apply ruff formatting to all Python files (121 files, 17K insertions) - Add type hints to function signatures throughout lightrag core and API - Update test suite with improved type annotations and docstrings - Add pyrightconfig.json for static type checking configuration - Create prompt_optimized.py and test_extraction_prompt_ab.py test files - Update ruff.toml and .gitignore for improved linting configuration - Standardize code style across examples, reproduce scripts, and utilities
93 lines
2.5 KiB
Python
93 lines
2.5 KiB
Python
"""
Quick test for Entity Resolution feature.

Tests that:
1. "FDA" and "US Food and Drug Administration" resolve to the same entity
2. "Dupixant" (typo) matches "Dupixent" via fuzzy matching
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import shutil
|
|
|
|
from lightrag import LightRAG
|
|
from lightrag.entity_resolution import EntityResolutionConfig
|
|
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
|
|
from lightrag.utils import logger
|
|
|
|
# Scratch directory for this test run; main() deletes and recreates it.
WORKING_DIR = './test_entity_resolution'

# Test document with entities that should be deduplicated:
# "FDA" vs. "US Food and Drug Administration", and the misspelling
# "Dupixant" vs. "Dupixent".
TEST_DOC = """
The FDA approved Dupixent for treating eczema in 2017.
The US Food and Drug Administration later expanded the drug's indications.
Dupixant (sometimes misspelled) has shown good results in clinical trials.
The FDA continues to monitor the safety of Dupixent.
"""
|
|
|
|
|
|
async def main() -> None:
    """Run the entity-resolution smoke test end to end.

    Recreates ``WORKING_DIR``, builds a ``LightRAG`` instance with entity
    resolution enabled, inserts ``TEST_DOC``, and prints the nodes and edge
    count found in the GraphML file written to the working directory.

    Returns early (after printing an error) when ``OPENAI_API_KEY`` is not
    set in the environment.
    """
    if not os.getenv('OPENAI_API_KEY'):
        print('Error: Set OPENAI_API_KEY environment variable')
        return

    # Clean up previous test
    if os.path.exists(WORKING_DIR):
        shutil.rmtree(WORKING_DIR)
    os.makedirs(WORKING_DIR)

    # Set up logging to see resolution messages
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    print('\n' + '=' * 60)
    print('Entity Resolution Test')
    print('=' * 60)

    rag = LightRAG(
        working_dir=WORKING_DIR,
        embedding_func=openai_embed,
        llm_model_func=gpt_4o_mini_complete,
        entity_resolution_config=EntityResolutionConfig(
            enabled=True,
            fuzzy_threshold=0.85,
            vector_threshold=0.5,
            max_candidates=3,
        ),
    )

    await rag.initialize_storages()

    # Once storages are initialized, always finalize them, even if the
    # insert or the graph inspection below raises.
    try:
        print('\nInserting test document...')
        print(f'Document: {TEST_DOC.strip()}')
        print('\n' + '-' * 60)

        await rag.ainsert(TEST_DOC)

        print('\n' + '-' * 60)
        print('Checking extracted entities...')

        # Read the graph to see what entities were created.
        # NOTE(review): file name presumably matches the graph storage's
        # on-disk name for this configuration - confirm.
        graph_file = os.path.join(WORKING_DIR, 'graph_chunk_entity_relation.graphml')
        if os.path.exists(graph_file):
            # Imported lazily so networkx is only needed when a graph exists.
            import networkx as nx

            graph = nx.read_graphml(graph_file)
            print(f'\nEntities in graph ({len(graph.nodes())} total):')
            for node in sorted(graph.nodes()):
                print(f' - {node}')

            print(f'\nRelationships: {len(graph.edges())}')
        else:
            print('Graph file not found')
    finally:
        await rag.finalize_storages()

    print('\n' + '=' * 60)
    print('Test complete!')
    print('=' * 60)
|
|
|
|
|
|
# Script entry point: drive the async test to completion on a fresh event loop
# when the file is executed directly (not when imported).
if __name__ == '__main__':
    asyncio.run(main())
|