LightRAG/tests/test_orphan_connection_quality.py

"""
Orphan Connection Quality Tests

Tests to validate that orphan connections improve (not poison) retrieval quality.

Test Categories:
1. Precision tests - Do orphan connections add relevant context?
2. Recall tests - Do orphan connections help find information that was missed?
3. Noise tests - Do orphan connections introduce irrelevant information?
4. A/B comparison - Same queries with/without connections
"""

import asyncio
import json
from dataclasses import dataclass
from typing import Optional


@dataclass
class QueryTestCase:
    """A test case for evaluating retrieval quality."""
    query: str
    expected_entities: list[str]  # Entities that SHOULD be retrieved
    unexpected_entities: list[str]  # Entities that should NOT be retrieved
    description: str
    category: str  # "precision", "recall", "noise"


# Test cases designed to evaluate orphan connection quality
TEST_CASES = [
    # PRECISION TESTS - Do we retrieve the RIGHT things?
    QueryTestCase(
        query="What types of neural networks are used in deep learning?",
        expected_entities=["Neural Networks", "Convolutional Neural Network",
                          "Recurrent Neural Network", "Transformer"],
        unexpected_entities=["Quantum Computing", "Climate Change", "FDA"],
        description="Should retrieve NN types via orphan connections (CNN->NN, RNN->NN)",
        category="precision"
    ),
    QueryTestCase(
        query="What quantum computing hardware approaches exist?",
        expected_entities=["Qubit", "Trapped Ions", "Superconducting Qubits",
                          "Photonic Qubits", "Topological Qubits", "IonQ"],
        unexpected_entities=["Neural Networks", "Machine Learning", "Climate Change"],
        description="Should retrieve qubit types via orphan connections",
        category="precision"
    ),

    # RECALL TESTS - Do we find things we would have MISSED without connections?
    QueryTestCase(
        query="What companies are working on quantum computing?",
        expected_entities=["IonQ", "Microsoft", "Google", "IBM"],
        unexpected_entities=[],
        description="Should find IonQ (connected via Trapped Ions) and Microsoft (via Topological Qubits)",
        category="recall"
    ),
    QueryTestCase(
        query="What are greenhouse gases?",
        expected_entities=["Carbon Dioxide (CO2)", "Methane (CH4)", "Nitrous Oxide (N2O)",
                          "Fluorinated Gases"],
        unexpected_entities=["Machine Learning", "Quantum Computing"],
        description="Should retrieve all GHGs via orphan connections forming a cluster",
        category="recall"
    ),

    # NOISE TESTS - Do we retrieve IRRELEVANT things?
    QueryTestCase(
        query="What is reinforcement learning?",
        expected_entities=["Reinforcement Learning", "Machine Learning"],
        unexpected_entities=["Climate Change", "FDA", "Vehicle Emissions Standards"],
        description="Should NOT pull in unrelated domains despite graph connectivity",
        category="noise"
    ),
    QueryTestCase(
        query="How does computer vision work?",
        expected_entities=["Computer Vision", "Image Segmentation", "Object Tracking",
                          "Feature Extraction", "Edge Detection"],
        unexpected_entities=["Quantum Computing", "Climate Modeling", "Drug Discovery"],
        description="Should retrieve CV techniques, not unrelated domains",
        category="noise"
    ),

    # EDGE CASE - Orphan connections shouldn't create nonsense pathways
    QueryTestCase(
        query="What is Amazon?",
        expected_entities=["Amazon"],
        unexpected_entities=[],  # We connected Amazon -> Microsoft, is this causing issues?
        description="Amazon query - check if connection to Microsoft causes retrieval issues",
        category="noise"
    ),
]


async def run_query(rag, query: str, mode: str = "local") -> dict:
    """Run a query and return retrieved entities."""
    # This would need to be adapted based on how LightRAG returns context
    result = await rag.aquery(query, param={"mode": mode})
    return result


async def evaluate_test_case(rag, test_case: QueryTestCase) -> dict:
    """Evaluate a single test case."""
    result = await run_query(rag, test_case.query)

    # Extract retrieved entities from result
    # (Implementation depends on LightRAG response format)
    retrieved_entities = []  # Parse from result

    # Calculate metrics
    expected_found = [e for e in test_case.expected_entities if e in retrieved_entities]
    unexpected_found = [e for e in test_case.unexpected_entities if e in retrieved_entities]

    precision = len(expected_found) / len(retrieved_entities) if retrieved_entities else 0
    recall = len(expected_found) / len(test_case.expected_entities) if test_case.expected_entities else 1
    noise_rate = len(unexpected_found) / len(retrieved_entities) if retrieved_entities else 0

    return {
        "test_case": test_case.description,
        "category": test_case.category,
        "query": test_case.query,
        "expected_found": expected_found,
        "expected_missed": [e for e in test_case.expected_entities if e not in retrieved_entities],
        "unexpected_found": unexpected_found,
        "precision": precision,
        "recall": recall,
        "noise_rate": noise_rate,
        "pass": len(unexpected_found) == 0 and recall > 0.5
    }


async def run_ab_comparison(rag_with_connections, rag_without_connections, query: str) -> dict:
    """
    Compare retrieval results with and without orphan connections.

    This requires two separate LightRAG instances:
    - One with orphan connections applied
    - One without (baseline)
    """
    result_with = await run_query(rag_with_connections, query)
    result_without = await run_query(rag_without_connections, query)

    return {
        "query": query,
        "with_connections": result_with,
        "without_connections": result_without,
        "improved": None,  # Human evaluation needed
    }


def generate_test_report(results: list[dict]) -> str:
    """Generate a test report from evaluation results."""
    report = ["# Orphan Connection Quality Test Report\n"]

    # Summary by category
    for category in ["precision", "recall", "noise"]:
        cat_results = [r for r in results if r["category"] == category]
        if cat_results:
            passed = sum(1 for r in cat_results if r["pass"])
            report.append(f"\n## {category.upper()} Tests: {passed}/{len(cat_results)} passed\n")
            for r in cat_results:
                status = "✅" if r["pass"] else "❌"
                report.append(f"- {status} {r['test_case']}")
                if r.get("unexpected_found"):
                    report.append(f"  - ⚠️ Noise detected: {r['unexpected_found']}")

    # Overall metrics
    all_precision = [r["precision"] for r in results if r["precision"] is not None]
    all_recall = [r["recall"] for r in results if r["recall"] is not None]
    all_noise = [r["noise_rate"] for r in results if r["noise_rate"] is not None]

    report.append(f"\n## Overall Metrics")
    report.append(f"- Average Precision: {sum(all_precision)/len(all_precision):.2f}")
    report.append(f"- Average Recall: {sum(all_recall)/len(all_recall):.2f}")
    report.append(f"- Average Noise Rate: {sum(all_noise)/len(all_noise):.2f}")

    return "\n".join(report)


# Manual evaluation checklist
EVALUATION_CHECKLIST = """
## Manual Evaluation Checklist

For each orphan connection, evaluate:

1. **Semantic Validity** (Is the connection logically correct?)
   - [ ] The entities are genuinely related
   - [ ] The relationship type makes sense
   - [ ] A human expert would agree with this connection

2. **Retrieval Impact** (Does this help or hurt queries?)
   - [ ] Queries about entity A now appropriately include entity B
   - [ ] Queries about entity B now appropriately include entity A
   - [ ] No unrelated queries are polluted by this connection

3. **Specificity** (Is the connection too broad?)
   - [ ] The connection is specific enough to be useful
   - [ ] Not just "both are technology" or "both are nouns"
   - [ ] The relationship description is meaningful

4. **Directionality** (Does the relationship make sense both ways?)
   - [ ] Query for A -> retrieves B makes sense
   - [ ] Query for B -> retrieves A makes sense

## Red Flags to Watch For:
- Connections between entirely different domains (e.g., Climate -> Quantum)
- Very low similarity scores with high confidence (LLM hallucination?)
- Hub entities getting too many connections (becoming noise magnets)
- Circular clusters forming (A->B->C->A with no external connections)
"""


if __name__ == "__main__":
    print("Orphan Connection Quality Test Framework")
    print("=" * 50)
    print(f"Total test cases: {len(TEST_CASES)}")
    print(f"- Precision tests: {len([t for t in TEST_CASES if t.category == 'precision'])}")
    print(f"- Recall tests: {len([t for t in TEST_CASES if t.category == 'recall'])}")
    print(f"- Noise tests: {len([t for t in TEST_CASES if t.category == 'noise'])}")
    print("\nRun with a LightRAG instance to execute tests.")
    print(EVALUATION_CHECKLIST)