"""
|
|
Orphan Connection Quality Tests
|
|
|
|
Tests to validate that orphan connections improve (not poison) retrieval quality.
|
|
|
|
Test Categories:
|
|
1. Precision tests - Do orphan connections add relevant context?
|
|
2. Recall tests - Do orphan connections help find information that was missed?
|
|
3. Noise tests - Do orphan connections introduce irrelevant information?
|
|
4. A/B comparison - Same queries with/without connections
|
|
"""

from dataclasses import dataclass

# QueryParam is LightRAG's documented way to select the retrieval mode (see run_query below)
from lightrag import QueryParam


@dataclass
class QueryTestCase:
    """A test case for evaluating retrieval quality."""

    query: str
    expected_entities: list[str]  # Entities that SHOULD be retrieved
    unexpected_entities: list[str]  # Entities that should NOT be retrieved
    description: str
    category: str  # "precision", "recall", "noise"


# Test cases designed to evaluate orphan connection quality
TEST_CASES = [
    # PRECISION TESTS - Do we retrieve the RIGHT things?
    QueryTestCase(
        query='What types of neural networks are used in deep learning?',
        expected_entities=[
            'Neural Networks',
            'Convolutional Neural Network',
            'Recurrent Neural Network',
            'Transformer',
        ],
        unexpected_entities=['Quantum Computing', 'Climate Change', 'FDA'],
        description='Should retrieve NN types via orphan connections (CNN->NN, RNN->NN)',
        category='precision',
    ),
    QueryTestCase(
        query='What quantum computing hardware approaches exist?',
        expected_entities=[
            'Qubit',
            'Trapped Ions',
            'Superconducting Qubits',
            'Photonic Qubits',
            'Topological Qubits',
            'IonQ',
        ],
        unexpected_entities=['Neural Networks', 'Machine Learning', 'Climate Change'],
        description='Should retrieve qubit types via orphan connections',
        category='precision',
    ),
    # RECALL TESTS - Do we find things we would have MISSED without connections?
    QueryTestCase(
        query='What companies are working on quantum computing?',
        expected_entities=['IonQ', 'Microsoft', 'Google', 'IBM'],
        unexpected_entities=[],
        description='Should find IonQ (connected via Trapped Ions) and Microsoft (via Topological Qubits)',
        category='recall',
    ),
    QueryTestCase(
        query='What are greenhouse gases?',
        expected_entities=['Carbon Dioxide (CO2)', 'Methane (CH4)', 'Nitrous Oxide (N2O)', 'Fluorinated Gases'],
        unexpected_entities=['Machine Learning', 'Quantum Computing'],
        description='Should retrieve all GHGs via orphan connections forming a cluster',
        category='recall',
    ),
    # NOISE TESTS - Do we retrieve IRRELEVANT things?
    QueryTestCase(
        query='What is reinforcement learning?',
        expected_entities=['Reinforcement Learning', 'Machine Learning'],
        unexpected_entities=['Climate Change', 'FDA', 'Vehicle Emissions Standards'],
        description='Should NOT pull in unrelated domains despite graph connectivity',
        category='noise',
    ),
    QueryTestCase(
        query='How does computer vision work?',
        expected_entities=[
            'Computer Vision',
            'Image Segmentation',
            'Object Tracking',
            'Feature Extraction',
            'Edge Detection',
        ],
        unexpected_entities=['Quantum Computing', 'Climate Modeling', 'Drug Discovery'],
        description='Should retrieve CV techniques, not unrelated domains',
        category='noise',
    ),
    # EDGE CASE - Orphan connections shouldn't create nonsense pathways
    QueryTestCase(
        query='What is Amazon?',
        expected_entities=['Amazon'],
        unexpected_entities=[],  # We connected Amazon -> Microsoft, is this causing issues?
        description='Amazon query - check if connection to Microsoft causes retrieval issues',
        category='noise',
    ),
]


async def run_query(rag, query: str, mode: str = 'local') -> dict:
    """Run a query against the given LightRAG instance and return the raw result."""
    # This would need to be adapted based on how LightRAG returns context
    # (if the installed version supports it, QueryParam(only_need_context=True)
    # returns the retrieved context instead of a generated answer).
    result = await rag.aquery(query, param=QueryParam(mode=mode))
    return result


async def evaluate_test_case(rag, test_case: QueryTestCase) -> dict:
    """Evaluate a single test case."""
    await run_query(rag, test_case.query)

    # Extract retrieved entities from result
    # (Implementation depends on LightRAG response format)
    retrieved_entities = []  # Parse from result

    # Calculate metrics
    expected_found = [e for e in test_case.expected_entities if e in retrieved_entities]
    unexpected_found = [e for e in test_case.unexpected_entities if e in retrieved_entities]

    precision = len(expected_found) / len(retrieved_entities) if retrieved_entities else 0
    recall = len(expected_found) / len(test_case.expected_entities) if test_case.expected_entities else 1
    noise_rate = len(unexpected_found) / len(retrieved_entities) if retrieved_entities else 0

    return {
        'test_case': test_case.description,
        'category': test_case.category,
        'query': test_case.query,
        'expected_found': expected_found,
        'expected_missed': [e for e in test_case.expected_entities if e not in retrieved_entities],
        'unexpected_found': unexpected_found,
        'precision': precision,
        'recall': recall,
        'noise_rate': noise_rate,
        'pass': len(unexpected_found) == 0 and recall > 0.5,
    }
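

# A minimal sketch of how retrieved_entities could be populated above, assuming only
# that the query result is (or contains) plain text in which retrieved entity names
# appear verbatim. extract_entity_mentions is a hypothetical helper, not part of
# LightRAG; adapt it to the actual response format.
def extract_entity_mentions(result_text: str, candidate_entities: list[str]) -> list[str]:
    """Return the candidate entity names that appear in the result text (case-insensitive)."""
    lowered = result_text.lower()
    return [name for name in candidate_entities if name.lower() in lowered]


# Example: extract_entity_mentions(result, test_case.expected_entities + test_case.unexpected_entities)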


async def run_ab_comparison(rag_with_connections, rag_without_connections, query: str) -> dict:
    """
    Compare retrieval results with and without orphan connections.

    This requires two separate LightRAG instances:
    - One with orphan connections applied
    - One without (baseline)
    """
    result_with = await run_query(rag_with_connections, query)
    result_without = await run_query(rag_without_connections, query)

    return {
        'query': query,
        'with_connections': result_with,
        'without_connections': result_without,
        'improved': None,  # Human evaluation needed
    }
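

# To make the manual A/B judgement easier, one could diff which entity names are
# mentioned in each result. This sketch reuses the hypothetical extract_entity_mentions
# helper above and assumes both results can be treated as plain text; it is not part of
# the framework's required API.
def diff_entity_mentions(ab_result: dict, candidate_entities: list[str]) -> dict:
    """Report which candidate entities appear only with, or only without, orphan connections."""
    with_hits = set(extract_entity_mentions(str(ab_result['with_connections']), candidate_entities))
    without_hits = set(extract_entity_mentions(str(ab_result['without_connections']), candidate_entities))
    return {
        'only_with_connections': sorted(with_hits - without_hits),
        'only_without_connections': sorted(without_hits - with_hits),
        'in_both': sorted(with_hits & without_hits),
    }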


def generate_test_report(results: list[dict]) -> str:
    """Generate a test report from evaluation results."""
    report = ['# Orphan Connection Quality Test Report\n']

    # Summary by category
    for category in ['precision', 'recall', 'noise']:
        cat_results = [r for r in results if r['category'] == category]
        if cat_results:
            passed = sum(1 for r in cat_results if r['pass'])
            report.append(f'\n## {category.upper()} Tests: {passed}/{len(cat_results)} passed\n')
            for r in cat_results:
                status = '✅' if r['pass'] else '❌'
                report.append(f'- {status} {r["test_case"]}')
                if r.get('unexpected_found'):
                    report.append(f'  - ⚠️ Noise detected: {r["unexpected_found"]}')

    # Overall metrics
    all_precision = [r['precision'] for r in results if r['precision'] is not None]
    all_recall = [r['recall'] for r in results if r['recall'] is not None]
    all_noise = [r['noise_rate'] for r in results if r['noise_rate'] is not None]

    report.append('\n## Overall Metrics')
    report.append(f'- Average Precision: {sum(all_precision) / len(all_precision):.2f}')
    report.append(f'- Average Recall: {sum(all_recall) / len(all_recall):.2f}')
    report.append(f'- Average Noise Rate: {sum(all_noise) / len(all_noise):.2f}')

    return '\n'.join(report)
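

# A minimal end-to-end driver is sketched below: it assumes rag is an already
# configured LightRAG instance with the test corpus ingested, runs every test case,
# and renders the report. The name run_all_tests is illustrative, not an existing API.
async def run_all_tests(rag) -> str:
    """Evaluate every test case against the given instance and return a Markdown report."""
    results = [await evaluate_test_case(rag, test_case) for test_case in TEST_CASES]
    return generate_test_report(results)


# Usage sketch (assumes rag is an already-initialized LightRAG instance):
#     import asyncio
#     report = asyncio.run(run_all_tests(rag))
#     print(report)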


# Manual evaluation checklist
EVALUATION_CHECKLIST = """
## Manual Evaluation Checklist

For each orphan connection, evaluate:

1. **Semantic Validity** (Is the connection logically correct?)
   - [ ] The entities are genuinely related
   - [ ] The relationship type makes sense
   - [ ] A human expert would agree with this connection

2. **Retrieval Impact** (Does this help or hurt queries?)
   - [ ] Queries about entity A now appropriately include entity B
   - [ ] Queries about entity B now appropriately include entity A
   - [ ] No unrelated queries are polluted by this connection

3. **Specificity** (Is the connection too broad?)
   - [ ] The connection is specific enough to be useful
   - [ ] Not just "both are technology" or "both are nouns"
   - [ ] The relationship description is meaningful

4. **Directionality** (Does the relationship make sense both ways?)
   - [ ] Query for A -> retrieves B makes sense
   - [ ] Query for B -> retrieves A makes sense

## Red Flags to Watch For:
- Connections between entirely different domains (e.g., Climate -> Quantum)
- Very low similarity scores with high confidence (LLM hallucination?)
- Hub entities getting too many connections (becoming noise magnets)
- Circular clusters forming (A->B->C->A with no external connections)
"""


if __name__ == '__main__':
    print('Orphan Connection Quality Test Framework')
    print('=' * 50)
    print(f'Total test cases: {len(TEST_CASES)}')
    print(f'- Precision tests: {len([t for t in TEST_CASES if t.category == "precision"])}')
    print(f'- Recall tests: {len([t for t in TEST_CASES if t.category == "recall"])}')
    print(f'- Noise tests: {len([t for t in TEST_CASES if t.category == "noise"])}')
    print('\nRun with a LightRAG instance to execute tests.')
    print(EVALUATION_CHECKLIST)