""" Unit tests for Entity Resolution Tests the 3-layer approach with mock embed_fn and llm_fn. No database or external services required. """ import pytest from lightrag.entity_resolution import ( EntityResolutionConfig, resolve_entity, ) # Mock embeddings - pre-computed for test entities # These simulate what an embedding model would return MOCK_EMBEDDINGS = { # FDA and full name have ~0.67 similarity (based on real test) "fda": [0.1, 0.2, 0.3, 0.4, 0.5], "us food and drug administration": [0.15, 0.25, 0.28, 0.38, 0.52], # Dupixent and dupilumab have ~0.63 similarity "dupixent": [0.5, 0.6, 0.7, 0.8, 0.9], "dupilumab": [0.48, 0.58, 0.72, 0.78, 0.88], # Celebrex and Cerebyx are different (low similarity) "celebrex": [0.9, 0.1, 0.2, 0.3, 0.4], "cerebyx": [0.1, 0.9, 0.8, 0.7, 0.6], # Default for unknown entities "default": [0.0, 0.0, 0.0, 0.0, 0.0], } # Mock LLM responses MOCK_LLM_RESPONSES = { ("fda", "us food and drug administration"): "YES", ("us food and drug administration", "fda"): "YES", ("dupixent", "dupilumab"): "YES", ("dupilumab", "dupixent"): "YES", ("heart attack", "myocardial infarction"): "YES", ("celebrex", "cerebyx"): "NO", ("metformin", "metoprolol"): "NO", } async def mock_embed_fn(text: str) -> list[float]: """Mock embedding function.""" key = text.lower().strip() return MOCK_EMBEDDINGS.get(key, MOCK_EMBEDDINGS["default"]) async def mock_llm_fn(prompt: str) -> str: """Mock LLM function that parses the prompt and returns YES/NO.""" # Extract term_a and term_b from the prompt lines = prompt.strip().split("\n") term_a = None term_b = None for line in lines: if line.startswith("Term A:"): term_a = line.replace("Term A:", "").strip().lower() elif line.startswith("Term B:"): term_b = line.replace("Term B:", "").strip().lower() if term_a and term_b: # Check both orderings response = MOCK_LLM_RESPONSES.get((term_a, term_b)) if response is None: response = MOCK_LLM_RESPONSES.get((term_b, term_a), "NO") return response return "NO" # Test fixtures @pytest.fixture def existing_entities(): """Existing entities in the knowledge graph.""" return [ ( "US Food and Drug Administration", MOCK_EMBEDDINGS["us food and drug administration"], ), ("Dupixent", MOCK_EMBEDDINGS["dupixent"]), ("Celebrex", MOCK_EMBEDDINGS["celebrex"]), ] @pytest.fixture def config(): """Default resolution config.""" return EntityResolutionConfig() # Layer 1: Case normalization tests class TestCaseNormalization: @pytest.mark.asyncio async def test_exact_match_same_case(self, existing_entities, config): """Exact match with same case.""" result = await resolve_entity( "Dupixent", existing_entities, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "match" assert result.matched_entity == "Dupixent" assert result.method == "exact" assert result.confidence == 1.0 @pytest.mark.asyncio async def test_exact_match_different_case(self, existing_entities, config): """DUPIXENT should match Dupixent via case normalization.""" result = await resolve_entity( "DUPIXENT", existing_entities, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "match" assert result.matched_entity == "Dupixent" assert result.method == "exact" @pytest.mark.asyncio async def test_exact_match_lowercase(self, existing_entities, config): """dupixent should match Dupixent.""" result = await resolve_entity( "dupixent", existing_entities, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "match" assert result.method == "exact" # Layer 2: Fuzzy matching tests class TestFuzzyMatching: @pytest.mark.asyncio async def test_fuzzy_match_typo(self, existing_entities, config): """Dupixant (typo) should match Dupixent via fuzzy matching (88%).""" result = await resolve_entity( "Dupixant", existing_entities, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "match" assert result.matched_entity == "Dupixent" assert result.method == "fuzzy" assert result.confidence >= 0.85 @pytest.mark.asyncio async def test_fuzzy_rejects_below_threshold(self, existing_entities, config): """Celebrex vs Cerebyx is 67% - should NOT fuzzy match.""" # Add Cerebyx as the query (Celebrex exists) result = await resolve_entity( "Cerebyx", existing_entities, mock_embed_fn, mock_llm_fn, config, ) # Should not be fuzzy match (67% < 85%) assert result.method != "fuzzy" or result.action == "new" # Layer 3: LLM verification tests class TestLLMVerification: @pytest.mark.asyncio async def test_llm_matches_acronym(self, existing_entities, config): """FDA should match US Food and Drug Administration via LLM.""" result = await resolve_entity( "FDA", existing_entities, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "match" assert result.matched_entity == "US Food and Drug Administration" assert result.method == "llm" @pytest.mark.asyncio async def test_llm_matches_brand_generic(self, config): """Dupixent should match dupilumab via LLM.""" existing = [ ("dupilumab", MOCK_EMBEDDINGS["dupilumab"]), ] result = await resolve_entity( "Dupixent", existing, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "match" assert result.matched_entity == "dupilumab" assert result.method == "llm" # Edge cases class TestEdgeCases: @pytest.mark.asyncio async def test_empty_existing_entities(self, config): """New entity when no existing entities.""" result = await resolve_entity( "NewEntity", [], mock_embed_fn, mock_llm_fn, config, ) assert result.action == "new" @pytest.mark.asyncio async def test_disabled_resolution(self, existing_entities): """Resolution disabled returns new.""" config = EntityResolutionConfig(enabled=False) result = await resolve_entity( "Dupixent", existing_entities, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "new" assert result.method == "disabled" @pytest.mark.asyncio async def test_genuinely_new_entity(self, existing_entities, config): """Completely new entity should return 'new'.""" result = await resolve_entity( "CompletelyNewDrug", existing_entities, mock_embed_fn, mock_llm_fn, config, ) assert result.action == "new" assert result.method == "none"