"""
|
|
Orphan Connection Quality Tests
|
|
|
|
Tests to validate that orphan connections improve (not poison) retrieval quality.
|
|
|
|
Test Categories:
|
|
1. Precision tests - Do orphan connections add relevant context?
|
|
2. Recall tests - Do orphan connections help find information that was missed?
|
|
3. Noise tests - Do orphan connections introduce irrelevant information?
|
|
4. A/B comparison - Same queries with/without connections
|
|
"""

from dataclasses import dataclass

# QueryParam is LightRAG's documented way to select the retrieval mode (see run_query below)
from lightrag import QueryParam


@dataclass
class QueryTestCase:
    """A test case for evaluating retrieval quality."""

    query: str
    expected_entities: list[str]  # Entities that SHOULD be retrieved
    unexpected_entities: list[str]  # Entities that should NOT be retrieved
    description: str
    category: str  # "precision", "recall", "noise"


# Test cases designed to evaluate orphan connection quality
TEST_CASES = [
    # PRECISION TESTS - Do we retrieve the RIGHT things?
    QueryTestCase(
        query='What types of neural networks are used in deep learning?',
        expected_entities=[
            'Neural Networks',
            'Convolutional Neural Network',
            'Recurrent Neural Network',
            'Transformer',
        ],
        unexpected_entities=['Quantum Computing', 'Climate Change', 'FDA'],
        description='Should retrieve NN types via orphan connections (CNN->NN, RNN->NN)',
        category='precision',
    ),
    QueryTestCase(
        query='What quantum computing hardware approaches exist?',
        expected_entities=[
            'Qubit',
            'Trapped Ions',
            'Superconducting Qubits',
            'Photonic Qubits',
            'Topological Qubits',
            'IonQ',
        ],
        unexpected_entities=['Neural Networks', 'Machine Learning', 'Climate Change'],
        description='Should retrieve qubit types via orphan connections',
        category='precision',
    ),
    # RECALL TESTS - Do we find things we would have MISSED without connections?
    QueryTestCase(
        query='What companies are working on quantum computing?',
        expected_entities=['IonQ', 'Microsoft', 'Google', 'IBM'],
        unexpected_entities=[],
        description='Should find IonQ (connected via Trapped Ions) and Microsoft (via Topological Qubits)',
        category='recall',
    ),
    QueryTestCase(
        query='What are greenhouse gases?',
        expected_entities=['Carbon Dioxide (CO2)', 'Methane (CH4)', 'Nitrous Oxide (N2O)', 'Fluorinated Gases'],
        unexpected_entities=['Machine Learning', 'Quantum Computing'],
        description='Should retrieve all GHGs via orphan connections forming a cluster',
        category='recall',
    ),
    # NOISE TESTS - Do we retrieve IRRELEVANT things?
    QueryTestCase(
        query='What is reinforcement learning?',
        expected_entities=['Reinforcement Learning', 'Machine Learning'],
        unexpected_entities=['Climate Change', 'FDA', 'Vehicle Emissions Standards'],
        description='Should NOT pull in unrelated domains despite graph connectivity',
        category='noise',
    ),
    QueryTestCase(
        query='How does computer vision work?',
        expected_entities=[
            'Computer Vision',
            'Image Segmentation',
            'Object Tracking',
            'Feature Extraction',
            'Edge Detection',
        ],
        unexpected_entities=['Quantum Computing', 'Climate Modeling', 'Drug Discovery'],
        description='Should retrieve CV techniques, not unrelated domains',
        category='noise',
    ),
    # EDGE CASE - Orphan connections shouldn't create nonsense pathways
    QueryTestCase(
        query='What is Amazon?',
        expected_entities=['Amazon'],
        unexpected_entities=[],  # We connected Amazon -> Microsoft, is this causing issues?
        description='Amazon query - check if connection to Microsoft causes retrieval issues',
        category='noise',
    ),
]


async def run_query(rag, query: str, mode: str = 'local') -> dict:
    """Run a query against the given LightRAG instance and return the raw result."""
    # This would need to be adapted based on how LightRAG returns context
    # (if the installed version supports it, QueryParam(only_need_context=True)
    # returns the retrieved context instead of a generated answer).
    result = await rag.aquery(query, param=QueryParam(mode=mode))
    return result


async def evaluate_test_case(rag, test_case: QueryTestCase) -> dict:
    """Evaluate a single test case."""
    await run_query(rag, test_case.query)

    # Extract retrieved entities from result
    # (Implementation depends on LightRAG response format)
    retrieved_entities = []  # Parse from result

    # Calculate metrics
    expected_found = [e for e in test_case.expected_entities if e in retrieved_entities]
    unexpected_found = [e for e in test_case.unexpected_entities if e in retrieved_entities]

    precision = len(expected_found) / len(retrieved_entities) if retrieved_entities else 0
    recall = len(expected_found) / len(test_case.expected_entities) if test_case.expected_entities else 1
    noise_rate = len(unexpected_found) / len(retrieved_entities) if retrieved_entities else 0

    return {
        'test_case': test_case.description,
        'category': test_case.category,
        'query': test_case.query,
        'expected_found': expected_found,
        'expected_missed': [e for e in test_case.expected_entities if e not in retrieved_entities],
        'unexpected_found': unexpected_found,
        'precision': precision,
        'recall': recall,
        'noise_rate': noise_rate,
        'pass': len(unexpected_found) == 0 and recall > 0.5,
    }
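

# A minimal sketch of how retrieved_entities could be populated above, assuming only
# that the query result is (or contains) plain text in which retrieved entity names
# appear verbatim. extract_entity_mentions is a hypothetical helper, not part of
# LightRAG; adapt it to the actual response format.
def extract_entity_mentions(result_text: str, candidate_entities: list[str]) -> list[str]:
    """Return the candidate entity names that appear in the result text (case-insensitive)."""
    lowered = result_text.lower()
    return [name for name in candidate_entities if name.lower() in lowered]


# Example: extract_entity_mentions(result, test_case.expected_entities + test_case.unexpected_entities)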


async def run_ab_comparison(rag_with_connections, rag_without_connections, query: str) -> dict:
    """
    Compare retrieval results with and without orphan connections.

    This requires two separate LightRAG instances:
    - One with orphan connections applied
    - One without (baseline)
    """
    result_with = await run_query(rag_with_connections, query)
    result_without = await run_query(rag_without_connections, query)

    return {
        'query': query,
        'with_connections': result_with,
        'without_connections': result_without,
        'improved': None,  # Human evaluation needed
    }
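

# To make the manual A/B judgement easier, one could diff which entity names are
# mentioned in each result. This sketch reuses the hypothetical extract_entity_mentions
# helper above and assumes both results can be treated as plain text; it is not part of
# the framework's required API.
def diff_entity_mentions(ab_result: dict, candidate_entities: list[str]) -> dict:
    """Report which candidate entities appear only with, or only without, orphan connections."""
    with_hits = set(extract_entity_mentions(str(ab_result['with_connections']), candidate_entities))
    without_hits = set(extract_entity_mentions(str(ab_result['without_connections']), candidate_entities))
    return {
        'only_with_connections': sorted(with_hits - without_hits),
        'only_without_connections': sorted(without_hits - with_hits),
        'in_both': sorted(with_hits & without_hits),
    }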


def generate_test_report(results: list[dict]) -> str:
    """Generate a test report from evaluation results."""
    report = ['# Orphan Connection Quality Test Report\n']

    # Summary by category
    for category in ['precision', 'recall', 'noise']:
        cat_results = [r for r in results if r['category'] == category]
        if cat_results:
            passed = sum(1 for r in cat_results if r['pass'])
            report.append(f'\n## {category.upper()} Tests: {passed}/{len(cat_results)} passed\n')
            for r in cat_results:
                status = '✅' if r['pass'] else '❌'
                report.append(f'- {status} {r["test_case"]}')
                if r.get('unexpected_found'):
                    report.append(f'  - ⚠️ Noise detected: {r["unexpected_found"]}')

    # Overall metrics
    all_precision = [r['precision'] for r in results if r['precision'] is not None]
    all_recall = [r['recall'] for r in results if r['recall'] is not None]
    all_noise = [r['noise_rate'] for r in results if r['noise_rate'] is not None]

    report.append('\n## Overall Metrics')
    report.append(f'- Average Precision: {sum(all_precision) / len(all_precision):.2f}')
    report.append(f'- Average Recall: {sum(all_recall) / len(all_recall):.2f}')
    report.append(f'- Average Noise Rate: {sum(all_noise) / len(all_noise):.2f}')

    return '\n'.join(report)
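

# A minimal end-to-end driver is sketched below: it assumes rag is an already
# configured LightRAG instance with the test corpus ingested, runs every test case,
# and renders the report. The name run_all_tests is illustrative, not an existing API.
async def run_all_tests(rag) -> str:
    """Evaluate every test case against the given instance and return a Markdown report."""
    results = [await evaluate_test_case(rag, test_case) for test_case in TEST_CASES]
    return generate_test_report(results)


# Usage sketch (assumes rag is an already-initialized LightRAG instance):
#     import asyncio
#     report = asyncio.run(run_all_tests(rag))
#     print(report)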


# Manual evaluation checklist
EVALUATION_CHECKLIST = """
## Manual Evaluation Checklist

For each orphan connection, evaluate:

1. **Semantic Validity** (Is the connection logically correct?)
   - [ ] The entities are genuinely related
   - [ ] The relationship type makes sense
   - [ ] A human expert would agree with this connection

2. **Retrieval Impact** (Does this help or hurt queries?)
   - [ ] Queries about entity A now appropriately include entity B
   - [ ] Queries about entity B now appropriately include entity A
   - [ ] No unrelated queries are polluted by this connection

3. **Specificity** (Is the connection too broad?)
   - [ ] The connection is specific enough to be useful
   - [ ] Not just "both are technology" or "both are nouns"
   - [ ] The relationship description is meaningful

4. **Directionality** (Does the relationship make sense both ways?)
   - [ ] Query for A -> retrieves B makes sense
   - [ ] Query for B -> retrieves A makes sense

## Red Flags to Watch For:
- Connections between entirely different domains (e.g., Climate -> Quantum)
- Very low similarity scores with high confidence (LLM hallucination?)
- Hub entities getting too many connections (becoming noise magnets)
- Circular clusters forming (A->B->C->A with no external connections)
"""


if __name__ == '__main__':
    print('Orphan Connection Quality Test Framework')
    print('=' * 50)
    print(f'Total test cases: {len(TEST_CASES)}')
    print(f'- Precision tests: {len([t for t in TEST_CASES if t.category == "precision"])}')
    print(f'- Recall tests: {len([t for t in TEST_CASES if t.category == "recall"])}')
    print(f'- Noise tests: {len([t for t in TEST_CASES if t.category == "noise"])}')
    print('\nRun with a LightRAG instance to execute tests.')
    print(EVALUATION_CHECKLIST)