LightRAG/tests/test_orphan_connection_quality.py

"""
Orphan Connection Quality Tests
Tests to validate that orphan connections improve (not poison) retrieval quality.
Test Categories:
1. Precision tests - Do orphan connections add relevant context?
2. Recall tests - Do orphan connections help find information that was missed?
3. Noise tests - Do orphan connections introduce irrelevant information?
4. A/B comparison - Same queries with/without connections
"""

from dataclasses import dataclass

from lightrag import QueryParam


@dataclass
class QueryTestCase:
    """A test case for evaluating retrieval quality."""

    query: str
    expected_entities: list[str]  # Entities that SHOULD be retrieved
    unexpected_entities: list[str]  # Entities that should NOT be retrieved
    description: str
    category: str  # "precision", "recall", "noise"


# Test cases designed to evaluate orphan connection quality
TEST_CASES = [
    # PRECISION TESTS - Do we retrieve the RIGHT things?
    QueryTestCase(
        query='What types of neural networks are used in deep learning?',
        expected_entities=[
            'Neural Networks',
            'Convolutional Neural Network',
            'Recurrent Neural Network',
            'Transformer',
        ],
        unexpected_entities=['Quantum Computing', 'Climate Change', 'FDA'],
        description='Should retrieve NN types via orphan connections (CNN->NN, RNN->NN)',
        category='precision',
    ),
    QueryTestCase(
        query='What quantum computing hardware approaches exist?',
        expected_entities=[
            'Qubit',
            'Trapped Ions',
            'Superconducting Qubits',
            'Photonic Qubits',
            'Topological Qubits',
            'IonQ',
        ],
        unexpected_entities=['Neural Networks', 'Machine Learning', 'Climate Change'],
        description='Should retrieve qubit types via orphan connections',
        category='precision',
    ),
    # RECALL TESTS - Do we find things we would have MISSED without connections?
    QueryTestCase(
        query='What companies are working on quantum computing?',
        expected_entities=['IonQ', 'Microsoft', 'Google', 'IBM'],
        unexpected_entities=[],
        description='Should find IonQ (connected via Trapped Ions) and Microsoft (via Topological Qubits)',
        category='recall',
    ),
    QueryTestCase(
        query='What are greenhouse gases?',
        expected_entities=['Carbon Dioxide (CO2)', 'Methane (CH4)', 'Nitrous Oxide (N2O)', 'Fluorinated Gases'],
        unexpected_entities=['Machine Learning', 'Quantum Computing'],
        description='Should retrieve all GHGs via orphan connections forming a cluster',
        category='recall',
    ),
    # NOISE TESTS - Do we retrieve IRRELEVANT things?
    QueryTestCase(
        query='What is reinforcement learning?',
        expected_entities=['Reinforcement Learning', 'Machine Learning'],
        unexpected_entities=['Climate Change', 'FDA', 'Vehicle Emissions Standards'],
        description='Should NOT pull in unrelated domains despite graph connectivity',
        category='noise',
    ),
    QueryTestCase(
        query='How does computer vision work?',
        expected_entities=[
            'Computer Vision',
            'Image Segmentation',
            'Object Tracking',
            'Feature Extraction',
            'Edge Detection',
        ],
        unexpected_entities=['Quantum Computing', 'Climate Modeling', 'Drug Discovery'],
        description='Should retrieve CV techniques, not unrelated domains',
        category='noise',
    ),
    # EDGE CASE - Orphan connections shouldn't create nonsense pathways
    QueryTestCase(
        query='What is Amazon?',
        expected_entities=['Amazon'],
        unexpected_entities=[],  # We connected Amazon -> Microsoft, is this causing issues?
        description='Amazon query - check if connection to Microsoft causes retrieval issues',
        category='noise',
    ),
]


async def run_query(rag, query: str, mode: str = 'local') -> dict:
    """Run a query against the given LightRAG instance and return the raw result."""
    # This would need to be adapted based on how LightRAG returns context
    result = await rag.aquery(query, param=QueryParam(mode=mode))
    return result


async def evaluate_test_case(rag, test_case: QueryTestCase) -> dict:
    """Evaluate a single test case."""
    result = await run_query(rag, test_case.query)

    # Extract retrieved entities from result
    # (Implementation depends on LightRAG response format; see the
    # match_entities_in_text sketch below for one naive approach.)
    retrieved_entities: list[str] = []  # TODO: parse entity names out of `result`

    # Calculate metrics
    expected_found = [e for e in test_case.expected_entities if e in retrieved_entities]
    unexpected_found = [e for e in test_case.unexpected_entities if e in retrieved_entities]
    precision = len(expected_found) / len(retrieved_entities) if retrieved_entities else 0
    recall = len(expected_found) / len(test_case.expected_entities) if test_case.expected_entities else 1
    noise_rate = len(unexpected_found) / len(retrieved_entities) if retrieved_entities else 0

    return {
        'test_case': test_case.description,
        'category': test_case.category,
        'query': test_case.query,
        'expected_found': expected_found,
        'expected_missed': [e for e in test_case.expected_entities if e not in retrieved_entities],
        'unexpected_found': unexpected_found,
        'precision': precision,
        'recall': recall,
        'noise_rate': noise_rate,
        'pass': len(unexpected_found) == 0 and recall > 0.5,
    }
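

# Illustrative helper (not part of the original framework): one naive way to
# fill in the entity-extraction step left open in evaluate_test_case(). It
# assumes the query result is plain text and simply checks which candidate
# entity names occur in it; a real implementation should parse the structured
# context that LightRAG returns.
def match_entities_in_text(text: str, candidate_entities: list[str]) -> list[str]:
    """Return the candidate entity names that appear in `text` (case-insensitive)."""
    lowered = str(text).lower()
    return [entity for entity in candidate_entities if entity.lower() in lowered]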


async def run_ab_comparison(rag_with_connections, rag_without_connections, query: str) -> dict:
    """
    Compare retrieval results with and without orphan connections.

    This requires two separate LightRAG instances:
    - One with orphan connections applied
    - One without (baseline)
    """
    result_with = await run_query(rag_with_connections, query)
    result_without = await run_query(rag_without_connections, query)

    return {
        'query': query,
        'with_connections': result_with,
        'without_connections': result_without,
        'improved': None,  # Human evaluation needed
    }


def generate_test_report(results: list[dict]) -> str:
    """Generate a test report from evaluation results."""
    report = ['# Orphan Connection Quality Test Report\n']

    # Summary by category
    for category in ['precision', 'recall', 'noise']:
        cat_results = [r for r in results if r['category'] == category]
        if cat_results:
            passed = sum(1 for r in cat_results if r['pass'])
            report.append(f'\n## {category.upper()} Tests: {passed}/{len(cat_results)} passed\n')
            for r in cat_results:
                status = '✅' if r['pass'] else '❌'
                report.append(f'- {status} {r["test_case"]}')
                if r.get('unexpected_found'):
                    report.append(f'  - ⚠️ Noise detected: {r["unexpected_found"]}')

    # Overall metrics (guarded so an empty results list does not divide by zero)
    all_precision = [r['precision'] for r in results if r['precision'] is not None]
    all_recall = [r['recall'] for r in results if r['recall'] is not None]
    all_noise = [r['noise_rate'] for r in results if r['noise_rate'] is not None]
    report.append('\n## Overall Metrics')
    if all_precision:
        report.append(f'- Average Precision: {sum(all_precision) / len(all_precision):.2f}')
    if all_recall:
        report.append(f'- Average Recall: {sum(all_recall) / len(all_recall):.2f}')
    if all_noise:
        report.append(f'- Average Noise Rate: {sum(all_noise) / len(all_noise):.2f}')

    return '\n'.join(report)
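

# Illustrative driver (not part of the original framework): assumes `rag` is an
# already-initialized LightRAG instance, evaluates every case in TEST_CASES,
# and returns the markdown report produced by generate_test_report().
async def run_all_tests(rag) -> str:
    results = [await evaluate_test_case(rag, test_case) for test_case in TEST_CASES]
    return generate_test_report(results)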


# Manual evaluation checklist
EVALUATION_CHECKLIST = """
## Manual Evaluation Checklist

For each orphan connection, evaluate:

1. **Semantic Validity** (Is the connection logically correct?)
   - [ ] The entities are genuinely related
   - [ ] The relationship type makes sense
   - [ ] A human expert would agree with this connection

2. **Retrieval Impact** (Does this help or hurt queries?)
   - [ ] Queries about entity A now appropriately include entity B
   - [ ] Queries about entity B now appropriately include entity A
   - [ ] No unrelated queries are polluted by this connection

3. **Specificity** (Is the connection too broad?)
   - [ ] The connection is specific enough to be useful
   - [ ] Not just "both are technology" or "both are nouns"
   - [ ] The relationship description is meaningful

4. **Directionality** (Does the relationship make sense both ways?)
   - [ ] Query for A -> retrieves B makes sense
   - [ ] Query for B -> retrieves A makes sense

## Red Flags to Watch For:
- Connections between entirely different domains (e.g., Climate -> Quantum)
- Very low similarity scores with high confidence (LLM hallucination?)
- Hub entities getting too many connections (becoming noise magnets)
- Circular clusters forming (A->B->C->A with no external connections)
"""


if __name__ == '__main__':
    print('Orphan Connection Quality Test Framework')
    print('=' * 50)
    print(f'Total test cases: {len(TEST_CASES)}')
    print(f'- Precision tests: {len([t for t in TEST_CASES if t.category == "precision"])}')
    print(f'- Recall tests: {len([t for t in TEST_CASES if t.category == "recall"])}')
    print(f'- Noise tests: {len([t for t in TEST_CASES if t.category == "noise"])}')
    print('\nRun with a LightRAG instance to execute tests.')
    print(EVALUATION_CHECKLIST)
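    # Example (illustrative, not in the original file): with a configured
    # LightRAG instance named `rag`, the full suite could be driven with:
    #   import asyncio
    #   print(asyncio.run(run_all_tests(rag)))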