LightRAG/tests/test_orphan_connection_quality.py
Commit d2c9e6e2ec by clssck: test(lightrag): add orphan connection feature with quality validation tests
Implement automatic orphan entity connection system that identifies entities with
no relationships and creates meaningful connections via vector similarity + LLM
validation. This improves knowledge graph connectivity and retrieval quality.
Changes:
- Add orphan connection configuration parameters (thresholds, cross-connect settings)
- Implement aconnect_orphan_entities() method with 4-step validation pipeline
- Add SQL templates for efficient orphan and candidate entity queries
- Create POST /graph/orphans/connect API endpoint with configurable parameters (see the example request below)
- Add orphan connection validation prompt for LLM-based relationship verification
- Include relationship density requirement in extraction prompts to prevent orphans
- Update docker-compose.test.yml with optimized extraction parameters
- Add quality validation test suite (run_quality_tests.py) for retrieval evaluation
- Add unit test framework (test_orphan_connection_quality.py) with test cases
- Enable auto-run of orphan connection after document processing
2025-11-28 18:23:30 +01:00
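
An example request against the new endpoint (a minimal sketch: the path comes from this change, but the port and the JSON field names are illustrative assumptions, not the endpoint's confirmed schema):

import requests

# Hypothetical parameter names; check the server's actual request schema before use.
resp = requests.post(
    "http://localhost:9621/graph/orphans/connect",
    json={"similarity_threshold": 0.8, "cross_connect": True},
)
print(resp.status_code, resp.json())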


"""
Orphan Connection Quality Tests

Tests to validate that orphan connections improve (not poison) retrieval quality.

Test Categories:
1. Precision tests - Do orphan connections add relevant context?
2. Recall tests - Do orphan connections help find information that was missed?
3. Noise tests - Do orphan connections introduce irrelevant information?
4. A/B comparison - Same queries with/without connections
"""
import asyncio
import json
from dataclasses import dataclass
from typing import Optional

# QueryParam is exported by the lightrag package and is used in run_query below.
from lightrag import QueryParam

@dataclass
class QueryTestCase:
"""A test case for evaluating retrieval quality."""
query: str
expected_entities: list[str] # Entities that SHOULD be retrieved
unexpected_entities: list[str] # Entities that should NOT be retrieved
description: str
category: str # "precision", "recall", "noise"
# Test cases designed to evaluate orphan connection quality
TEST_CASES = [
# PRECISION TESTS - Do we retrieve the RIGHT things?
QueryTestCase(
query="What types of neural networks are used in deep learning?",
expected_entities=["Neural Networks", "Convolutional Neural Network",
"Recurrent Neural Network", "Transformer"],
unexpected_entities=["Quantum Computing", "Climate Change", "FDA"],
description="Should retrieve NN types via orphan connections (CNN->NN, RNN->NN)",
category="precision"
),
QueryTestCase(
query="What quantum computing hardware approaches exist?",
expected_entities=["Qubit", "Trapped Ions", "Superconducting Qubits",
"Photonic Qubits", "Topological Qubits", "IonQ"],
unexpected_entities=["Neural Networks", "Machine Learning", "Climate Change"],
description="Should retrieve qubit types via orphan connections",
category="precision"
),
# RECALL TESTS - Do we find things we would have MISSED without connections?
QueryTestCase(
query="What companies are working on quantum computing?",
expected_entities=["IonQ", "Microsoft", "Google", "IBM"],
unexpected_entities=[],
description="Should find IonQ (connected via Trapped Ions) and Microsoft (via Topological Qubits)",
category="recall"
),
QueryTestCase(
query="What are greenhouse gases?",
expected_entities=["Carbon Dioxide (CO2)", "Methane (CH4)", "Nitrous Oxide (N2O)",
"Fluorinated Gases"],
unexpected_entities=["Machine Learning", "Quantum Computing"],
description="Should retrieve all GHGs via orphan connections forming a cluster",
category="recall"
),
# NOISE TESTS - Do we retrieve IRRELEVANT things?
QueryTestCase(
query="What is reinforcement learning?",
expected_entities=["Reinforcement Learning", "Machine Learning"],
unexpected_entities=["Climate Change", "FDA", "Vehicle Emissions Standards"],
description="Should NOT pull in unrelated domains despite graph connectivity",
category="noise"
),
QueryTestCase(
query="How does computer vision work?",
expected_entities=["Computer Vision", "Image Segmentation", "Object Tracking",
"Feature Extraction", "Edge Detection"],
unexpected_entities=["Quantum Computing", "Climate Modeling", "Drug Discovery"],
description="Should retrieve CV techniques, not unrelated domains",
category="noise"
),
# EDGE CASE - Orphan connections shouldn't create nonsense pathways
QueryTestCase(
query="What is Amazon?",
expected_entities=["Amazon"],
unexpected_entities=[], # We connected Amazon -> Microsoft, is this causing issues?
description="Amazon query - check if connection to Microsoft causes retrieval issues",
category="noise"
),
]
async def run_query(rag, query: str, mode: str = "local"):
    """Run a query and return the raw result; retrieved entities are parsed from it."""
    # Assumes LightRAG's QueryParam API; adapt this call (e.g. to request
    # context-only output) based on how your LightRAG version returns context.
    result = await rag.aquery(query, param=QueryParam(mode=mode))
    return result
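
# A minimal sketch of extracting entity names from a query result. The matching
# below is an illustrative assumption (it only checks whether known entity names
# appear verbatim in the returned text); adapt it to the actual context format
# your LightRAG deployment returns.
def extract_entities_from_result(result, known_entities: list[str]) -> list[str]:
    """Return the known entity names that appear in the result text."""
    text = result if isinstance(result, str) else json.dumps(result, default=str)
    lowered = text.lower()
    return [name for name in known_entities if name.lower() in lowered]
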
async def evaluate_test_case(rag, test_case: QueryTestCase) -> dict:
"""Evaluate a single test case."""
result = await run_query(rag, test_case.query)
    # Extract retrieved entities from the result. The naive name-matching helper
    # above is a placeholder; swap in a parser for your LightRAG response format
    # if you need the exact retrieved entity set.
    retrieved_entities = extract_entities_from_result(
        result, test_case.expected_entities + test_case.unexpected_entities
    )
# Calculate metrics
expected_found = [e for e in test_case.expected_entities if e in retrieved_entities]
unexpected_found = [e for e in test_case.unexpected_entities if e in retrieved_entities]
precision = len(expected_found) / len(retrieved_entities) if retrieved_entities else 0
recall = len(expected_found) / len(test_case.expected_entities) if test_case.expected_entities else 1
noise_rate = len(unexpected_found) / len(retrieved_entities) if retrieved_entities else 0
return {
"test_case": test_case.description,
"category": test_case.category,
"query": test_case.query,
"expected_found": expected_found,
"expected_missed": [e for e in test_case.expected_entities if e not in retrieved_entities],
"unexpected_found": unexpected_found,
"precision": precision,
"recall": recall,
"noise_rate": noise_rate,
"pass": len(unexpected_found) == 0 and recall > 0.5
}
async def run_ab_comparison(rag_with_connections, rag_without_connections, query: str) -> dict:
"""
Compare retrieval results with and without orphan connections.
This requires two separate LightRAG instances:
- One with orphan connections applied
- One without (baseline)
"""
result_with = await run_query(rag_with_connections, query)
result_without = await run_query(rag_without_connections, query)
return {
"query": query,
"with_connections": result_with,
"without_connections": result_without,
"improved": None, # Human evaluation needed
}
def generate_test_report(results: list[dict]) -> str:
"""Generate a test report from evaluation results."""
report = ["# Orphan Connection Quality Test Report\n"]
# Summary by category
for category in ["precision", "recall", "noise"]:
cat_results = [r for r in results if r["category"] == category]
if cat_results:
passed = sum(1 for r in cat_results if r["pass"])
report.append(f"\n## {category.upper()} Tests: {passed}/{len(cat_results)} passed\n")
for r in cat_results:
                status = "✅" if r["pass"] else "❌"
report.append(f"- {status} {r['test_case']}")
if r.get("unexpected_found"):
report.append(f" - ⚠️ Noise detected: {r['unexpected_found']}")
# Overall metrics
all_precision = [r["precision"] for r in results if r["precision"] is not None]
all_recall = [r["recall"] for r in results if r["recall"] is not None]
all_noise = [r["noise_rate"] for r in results if r["noise_rate"] is not None]
report.append(f"\n## Overall Metrics")
report.append(f"- Average Precision: {sum(all_precision)/len(all_precision):.2f}")
report.append(f"- Average Recall: {sum(all_recall)/len(all_recall):.2f}")
report.append(f"- Average Noise Rate: {sum(all_noise)/len(all_noise):.2f}")
return "\n".join(report)
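
# A small end-to-end driver tying the helpers above together. A sketch: it
# assumes `rag` is an already initialized LightRAG instance with the test
# corpus ingested. Example usage: print(asyncio.run(run_all_tests(rag)))
async def run_all_tests(rag) -> str:
    """Evaluate every test case and return the generated report."""
    results = [await evaluate_test_case(rag, tc) for tc in TEST_CASES]
    return generate_test_report(results)
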
# Manual evaluation checklist
EVALUATION_CHECKLIST = """
## Manual Evaluation Checklist
For each orphan connection, evaluate:
1. **Semantic Validity** (Is the connection logically correct?)
- [ ] The entities are genuinely related
- [ ] The relationship type makes sense
- [ ] A human expert would agree with this connection
2. **Retrieval Impact** (Does this help or hurt queries?)
- [ ] Queries about entity A now appropriately include entity B
- [ ] Queries about entity B now appropriately include entity A
- [ ] No unrelated queries are polluted by this connection
3. **Specificity** (Is the connection too broad?)
- [ ] The connection is specific enough to be useful
- [ ] Not just "both are technology" or "both are nouns"
- [ ] The relationship description is meaningful
4. **Directionality** (Does the relationship make sense both ways?)
- [ ] Query for A -> retrieves B makes sense
- [ ] Query for B -> retrieves A makes sense
## Red Flags to Watch For:
- Connections between entirely different domains (e.g., Climate -> Quantum)
- Very low similarity scores with high confidence (LLM hallucination?)
- Hub entities getting too many connections (becoming noise magnets)
- Circular clusters forming (A->B->C->A with no external connections)
"""
if __name__ == "__main__":
print("Orphan Connection Quality Test Framework")
print("=" * 50)
print(f"Total test cases: {len(TEST_CASES)}")
print(f"- Precision tests: {len([t for t in TEST_CASES if t.category == 'precision'])}")
print(f"- Recall tests: {len([t for t in TEST_CASES if t.category == 'recall'])}")
print(f"- Noise tests: {len([t for t in TEST_CASES if t.category == 'noise'])}")
print("\nRun with a LightRAG instance to execute tests.")
print(EVALUATION_CHECKLIST)