""" Orphan Connection Quality Tests Tests to validate that orphan connections improve (not poison) retrieval quality. Test Categories: 1. Precision tests - Do orphan connections add relevant context? 2. Recall tests - Do orphan connections help find information that was missed? 3. Noise tests - Do orphan connections introduce irrelevant information? 4. A/B comparison - Same queries with/without connections """ from dataclasses import dataclass @dataclass class QueryTestCase: """A test case for evaluating retrieval quality.""" query: str expected_entities: list[str] # Entities that SHOULD be retrieved unexpected_entities: list[str] # Entities that should NOT be retrieved description: str category: str # "precision", "recall", "noise" # Test cases designed to evaluate orphan connection quality TEST_CASES = [ # PRECISION TESTS - Do we retrieve the RIGHT things? QueryTestCase( query='What types of neural networks are used in deep learning?', expected_entities=[ 'Neural Networks', 'Convolutional Neural Network', 'Recurrent Neural Network', 'Transformer', ], unexpected_entities=['Quantum Computing', 'Climate Change', 'FDA'], description='Should retrieve NN types via orphan connections (CNN->NN, RNN->NN)', category='precision', ), QueryTestCase( query='What quantum computing hardware approaches exist?', expected_entities=[ 'Qubit', 'Trapped Ions', 'Superconducting Qubits', 'Photonic Qubits', 'Topological Qubits', 'IonQ', ], unexpected_entities=['Neural Networks', 'Machine Learning', 'Climate Change'], description='Should retrieve qubit types via orphan connections', category='precision', ), # RECALL TESTS - Do we find things we would have MISSED without connections? QueryTestCase( query='What companies are working on quantum computing?', expected_entities=['IonQ', 'Microsoft', 'Google', 'IBM'], unexpected_entities=[], description='Should find IonQ (connected via Trapped Ions) and Microsoft (via Topological Qubits)', category='recall', ), QueryTestCase( query='What are greenhouse gases?', expected_entities=['Carbon Dioxide (CO2)', 'Methane (CH4)', 'Nitrous Oxide (N2O)', 'Fluorinated Gases'], unexpected_entities=['Machine Learning', 'Quantum Computing'], description='Should retrieve all GHGs via orphan connections forming a cluster', category='recall', ), # NOISE TESTS - Do we retrieve IRRELEVANT things? QueryTestCase( query='What is reinforcement learning?', expected_entities=['Reinforcement Learning', 'Machine Learning'], unexpected_entities=['Climate Change', 'FDA', 'Vehicle Emissions Standards'], description='Should NOT pull in unrelated domains despite graph connectivity', category='noise', ), QueryTestCase( query='How does computer vision work?', expected_entities=[ 'Computer Vision', 'Image Segmentation', 'Object Tracking', 'Feature Extraction', 'Edge Detection', ], unexpected_entities=['Quantum Computing', 'Climate Modeling', 'Drug Discovery'], description='Should retrieve CV techniques, not unrelated domains', category='noise', ), # EDGE CASE - Orphan connections shouldn't create nonsense pathways QueryTestCase( query='What is Amazon?', expected_entities=['Amazon'], unexpected_entities=[], # We connected Amazon -> Microsoft, is this causing issues? 


async def run_query(rag, query: str, mode: str = 'local') -> dict:
    """Run a query and return the raw retrieval result (entities are parsed downstream)."""
    # This would need to be adapted based on how LightRAG returns context.
    # NOTE: depending on the LightRAG version, `param` may need to be a
    # QueryParam instance (e.g. QueryParam(mode=mode)) rather than a dict.
    result = await rag.aquery(query, param={'mode': mode})
    return result


async def evaluate_test_case(rag, test_case: QueryTestCase) -> dict:
    """Evaluate a single test case."""
    result = await run_query(rag, test_case.query)

    # Extract retrieved entities from `result`
    # (Implementation depends on LightRAG response format; the
    # extract_entities_from_context helper sketched above is one option.)
    retrieved_entities: list[str] = []  # Parse from result

    # Calculate metrics
    expected_found = [e for e in test_case.expected_entities if e in retrieved_entities]
    unexpected_found = [e for e in test_case.unexpected_entities if e in retrieved_entities]

    precision = len(expected_found) / len(retrieved_entities) if retrieved_entities else 0
    recall = len(expected_found) / len(test_case.expected_entities) if test_case.expected_entities else 1
    noise_rate = len(unexpected_found) / len(retrieved_entities) if retrieved_entities else 0

    return {
        'test_case': test_case.description,
        'category': test_case.category,
        'query': test_case.query,
        'expected_found': expected_found,
        'expected_missed': [e for e in test_case.expected_entities if e not in retrieved_entities],
        'unexpected_found': unexpected_found,
        'precision': precision,
        'recall': recall,
        'noise_rate': noise_rate,
        'pass': len(unexpected_found) == 0 and recall > 0.5,
    }


async def run_ab_comparison(rag_with_connections, rag_without_connections, query: str) -> dict:
    """
    Compare retrieval results with and without orphan connections.

    This requires two separate LightRAG instances:
    - One with orphan connections applied
    - One without (baseline)
    """
    result_with = await run_query(rag_with_connections, query)
    result_without = await run_query(rag_without_connections, query)

    return {
        'query': query,
        'with_connections': result_with,
        'without_connections': result_without,
        'improved': None,  # Human evaluation needed
    }


def generate_test_report(results: list[dict]) -> str:
    """Generate a test report from evaluation results."""
    report = ['# Orphan Connection Quality Test Report\n']

    # Summary by category
    for category in ['precision', 'recall', 'noise']:
        cat_results = [r for r in results if r['category'] == category]
        if cat_results:
            passed = sum(1 for r in cat_results if r['pass'])
            report.append(f'\n## {category.upper()} Tests: {passed}/{len(cat_results)} passed\n')
            for r in cat_results:
                status = '✅' if r['pass'] else '❌'
                report.append(f'- {status} {r["test_case"]}')
                if r.get('unexpected_found'):
                    report.append(f'  - ⚠️ Noise detected: {r["unexpected_found"]}')

    # Overall metrics
    all_precision = [r['precision'] for r in results if r['precision'] is not None]
    all_recall = [r['recall'] for r in results if r['recall'] is not None]
    all_noise = [r['noise_rate'] for r in results if r['noise_rate'] is not None]

    report.append('\n## Overall Metrics')
    report.append(f'- Average Precision: {sum(all_precision) / len(all_precision):.2f}')
    report.append(f'- Average Recall: {sum(all_recall) / len(all_recall):.2f}')
    report.append(f'- Average Noise Rate: {sum(all_noise) / len(all_noise):.2f}')

    return '\n'.join(report)
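

# A minimal end-to-end driver, sketched under the assumption that `rag` is an
# already-initialized LightRAG instance and that the entity-extraction step in
# evaluate_test_case() has been adapted to your response format. It runs every
# test case defined above and renders the markdown report; the name
# run_all_tests is illustrative, not part of an existing API.
# Usage (inside an async context): print(await run_all_tests(rag))
async def run_all_tests(rag) -> str:
    """Run every test case above against one RAG instance and return the markdown report."""
    results = [await evaluate_test_case(rag, test_case) for test_case in TEST_CASES]
    return generate_test_report(results)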


# Manual evaluation checklist
EVALUATION_CHECKLIST = """
## Manual Evaluation Checklist

For each orphan connection, evaluate:

1. **Semantic Validity** (Is the connection logically correct?)
   - [ ] The entities are genuinely related
   - [ ] The relationship type makes sense
   - [ ] A human expert would agree with this connection

2. **Retrieval Impact** (Does this help or hurt queries?)
   - [ ] Queries about entity A now appropriately include entity B
   - [ ] Queries about entity B now appropriately include entity A
   - [ ] No unrelated queries are polluted by this connection

3. **Specificity** (Is the connection too broad?)
   - [ ] The connection is specific enough to be useful
   - [ ] Not just "both are technology" or "both are nouns"
   - [ ] The relationship description is meaningful

4. **Directionality** (Does the relationship make sense both ways?)
   - [ ] Query for A -> retrieves B makes sense
   - [ ] Query for B -> retrieves A makes sense

## Red Flags to Watch For:
- Connections between entirely different domains (e.g., Climate -> Quantum)
- Very low similarity scores with high confidence (LLM hallucination?)
- Hub entities getting too many connections (becoming noise magnets)
- Circular clusters forming (A->B->C->A with no external connections)
"""


if __name__ == '__main__':
    print('Orphan Connection Quality Test Framework')
    print('=' * 50)
    print(f'Total test cases: {len(TEST_CASES)}')
    print(f'- Precision tests: {len([t for t in TEST_CASES if t.category == "precision"])}')
    print(f'- Recall tests: {len([t for t in TEST_CASES if t.category == "recall"])}')
    print(f'- Noise tests: {len([t for t in TEST_CASES if t.category == "noise"])}')
    print('\nRun with a LightRAG instance to execute tests.')
    print(EVALUATION_CHECKLIST)
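

# Appendix sketch for the "hub entities" red flag in EVALUATION_CHECKLIST: given the
# proposed orphan connections as (source, target) name pairs, flag entities whose
# degree exceeds a threshold and that risk becoming noise magnets. Both the input
# format and the default threshold of 5 are illustrative assumptions.
def find_hub_entities(connections: list[tuple[str, str]], max_degree: int = 5) -> dict[str, int]:
    """Return {entity: degree} for entities with more than max_degree proposed connections."""
    degree: dict[str, int] = {}
    for source, target in connections:
        degree[source] = degree.get(source, 0) + 1
        degree[target] = degree.get(target, 0) + 1
    return {entity: count for entity, count in degree.items() if count > max_degree}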