Implement an automatic orphan entity connection system that identifies entities with no relationships and creates meaningful connections via vector similarity plus LLM validation. This improves knowledge graph connectivity and retrieval quality.

Changes:
- Add orphan connection configuration parameters (thresholds, cross-connect settings)
- Implement the aconnect_orphan_entities() method with a 4-step validation pipeline
- Add SQL templates for efficient orphan and candidate entity queries
- Create a POST /graph/orphans/connect API endpoint with configurable parameters
- Add an orphan connection validation prompt for LLM-based relationship verification
- Include a relationship density requirement in extraction prompts to prevent orphans
- Update docker-compose.test.yml with optimized extraction parameters
- Add a quality validation test suite (run_quality_tests.py) for retrieval evaluation
- Add a unit test framework (test_orphan_connection_quality.py) with test cases
- Enable auto-run of orphan connection after document processing
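For orientation, here is a minimal sketch of how the new endpoint could be invoked from Python once the server is running. The route comes from this change; the base URL, parameter names, and values below are illustrative placeholders, not the actual configuration schema.

    import requests

    # Hypothetical request body; the real field names come from the new
    # orphan connection configuration parameters and may differ.
    payload = {
        "similarity_threshold": 0.8,  # minimum vector similarity for candidate pairs (assumed name)
        "max_candidates": 5,          # candidates to validate per orphan entity (assumed name)
        "cross_connect": True,        # allow connections across documents (assumed name)
    }

    resp = requests.post("http://localhost:9621/graph/orphans/connect",  # assumed base URL
                         json=payload, timeout=300)
    resp.raise_for_status()
    print(resp.json())  # e.g. counts of orphans found and connections created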
test_orphan_connection_quality.py
"""
|
|
Orphan Connection Quality Tests
|
|
|
|
Tests to validate that orphan connections improve (not poison) retrieval quality.
|
|
|
|
Test Categories:
|
|
1. Precision tests - Do orphan connections add relevant context?
|
|
2. Recall tests - Do orphan connections help find information that was missed?
|
|
3. Noise tests - Do orphan connections introduce irrelevant information?
|
|
4. A/B comparison - Same queries with/without connections
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
class QueryTestCase:
    """A test case for evaluating retrieval quality."""
    query: str
    expected_entities: list[str]  # Entities that SHOULD be retrieved
    unexpected_entities: list[str]  # Entities that should NOT be retrieved
    description: str
    category: str  # "precision", "recall", "noise"


# Test cases designed to evaluate orphan connection quality
TEST_CASES = [
    # PRECISION TESTS - Do we retrieve the RIGHT things?
    QueryTestCase(
        query="What types of neural networks are used in deep learning?",
        expected_entities=["Neural Networks", "Convolutional Neural Network",
                           "Recurrent Neural Network", "Transformer"],
        unexpected_entities=["Quantum Computing", "Climate Change", "FDA"],
        description="Should retrieve NN types via orphan connections (CNN->NN, RNN->NN)",
        category="precision"
    ),
    QueryTestCase(
        query="What quantum computing hardware approaches exist?",
        expected_entities=["Qubit", "Trapped Ions", "Superconducting Qubits",
                           "Photonic Qubits", "Topological Qubits", "IonQ"],
        unexpected_entities=["Neural Networks", "Machine Learning", "Climate Change"],
        description="Should retrieve qubit types via orphan connections",
        category="precision"
    ),

    # RECALL TESTS - Do we find things we would have MISSED without connections?
    QueryTestCase(
        query="What companies are working on quantum computing?",
        expected_entities=["IonQ", "Microsoft", "Google", "IBM"],
        unexpected_entities=[],
        description="Should find IonQ (connected via Trapped Ions) and Microsoft (via Topological Qubits)",
        category="recall"
    ),
    QueryTestCase(
        query="What are greenhouse gases?",
        expected_entities=["Carbon Dioxide (CO2)", "Methane (CH4)", "Nitrous Oxide (N2O)",
                           "Fluorinated Gases"],
        unexpected_entities=["Machine Learning", "Quantum Computing"],
        description="Should retrieve all GHGs via orphan connections forming a cluster",
        category="recall"
    ),

    # NOISE TESTS - Do we retrieve IRRELEVANT things?
    QueryTestCase(
        query="What is reinforcement learning?",
        expected_entities=["Reinforcement Learning", "Machine Learning"],
        unexpected_entities=["Climate Change", "FDA", "Vehicle Emissions Standards"],
        description="Should NOT pull in unrelated domains despite graph connectivity",
        category="noise"
    ),
    QueryTestCase(
        query="How does computer vision work?",
        expected_entities=["Computer Vision", "Image Segmentation", "Object Tracking",
                           "Feature Extraction", "Edge Detection"],
        unexpected_entities=["Quantum Computing", "Climate Modeling", "Drug Discovery"],
        description="Should retrieve CV techniques, not unrelated domains",
        category="noise"
    ),

    # EDGE CASE - Orphan connections shouldn't create nonsense pathways
    QueryTestCase(
        query="What is Amazon?",
        expected_entities=["Amazon"],
        unexpected_entities=[],  # We connected Amazon -> Microsoft, is this causing issues?
        description="Amazon query - check if connection to Microsoft causes retrieval issues",
        category="noise"
    ),
]


async def run_query(rag, query: str, mode: str = "local") -> str:
    """Run a query and return the raw response text for the given retrieval mode."""
    # This would need to be adapted based on how LightRAG returns context;
    # aquery() expects a QueryParam and by default returns the generated answer.
    result = await rag.aquery(query, param=QueryParam(mode=mode))
    return result
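

def extract_retrieved_entities(response_text: str, candidate_entities: list[str]) -> list[str]:
    """Naive fallback parser: report which candidate entity names appear in the response.

    NOTE: This is an assumption-driven placeholder. It only does case-insensitive
    substring matching of known entity names against the response text; if LightRAG
    is configured to return structured context instead, replace this with a real parser.
    """
    text = response_text.lower()
    return [entity for entity in candidate_entities if entity.lower() in text]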


async def evaluate_test_case(rag, test_case: QueryTestCase) -> dict:
    """Evaluate a single test case."""
    result = await run_query(rag, test_case.query)

    # Extract retrieved entities from result.
    # (Implementation depends on LightRAG response format; the substring-matching
    # fallback below only detects entities we already know to look for.)
    known_entities = test_case.expected_entities + test_case.unexpected_entities
    retrieved_entities = extract_retrieved_entities(str(result), known_entities)

    # Calculate metrics
    expected_found = [e for e in test_case.expected_entities if e in retrieved_entities]
    unexpected_found = [e for e in test_case.unexpected_entities if e in retrieved_entities]

    precision = len(expected_found) / len(retrieved_entities) if retrieved_entities else 0
    recall = len(expected_found) / len(test_case.expected_entities) if test_case.expected_entities else 1
    noise_rate = len(unexpected_found) / len(retrieved_entities) if retrieved_entities else 0

    return {
        "test_case": test_case.description,
        "category": test_case.category,
        "query": test_case.query,
        "expected_found": expected_found,
        "expected_missed": [e for e in test_case.expected_entities if e not in retrieved_entities],
        "unexpected_found": unexpected_found,
        "precision": precision,
        "recall": recall,
        "noise_rate": noise_rate,
        "pass": len(unexpected_found) == 0 and recall > 0.5
    }


async def run_ab_comparison(rag_with_connections, rag_without_connections, query: str) -> dict:
    """
    Compare retrieval results with and without orphan connections.

    This requires two separate LightRAG instances:
    - One with orphan connections applied
    - One without (baseline)
    """
    result_with = await run_query(rag_with_connections, query)
    result_without = await run_query(rag_without_connections, query)

    return {
        "query": query,
        "with_connections": result_with,
        "without_connections": result_without,
        "improved": None,  # Human evaluation needed
    }
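

async def dump_ab_comparisons(rag_with_connections, rag_without_connections,
                              output_path: str = "ab_comparison.json") -> None:
    """Run the A/B comparison for every test query and write the results to a JSON file.

    Sketch only: assumes both instances are already initialized and that their
    responses are plain strings; the "improved" field is left for human review.
    """
    comparisons = []
    for test_case in TEST_CASES:
        comparisons.append(
            await run_ab_comparison(rag_with_connections, rag_without_connections, test_case.query)
        )
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(comparisons, f, indent=2, ensure_ascii=False, default=str)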


def generate_test_report(results: list[dict]) -> str:
    """Generate a test report from evaluation results."""
    report = ["# Orphan Connection Quality Test Report\n"]

    # Summary by category
    for category in ["precision", "recall", "noise"]:
        cat_results = [r for r in results if r["category"] == category]
        if cat_results:
            passed = sum(1 for r in cat_results if r["pass"])
            report.append(f"\n## {category.upper()} Tests: {passed}/{len(cat_results)} passed\n")
            for r in cat_results:
                status = "✅" if r["pass"] else "❌"
                report.append(f"- {status} {r['test_case']}")
                if r.get("unexpected_found"):
                    report.append(f"  - ⚠️ Noise detected: {r['unexpected_found']}")

    # Overall metrics (guard against an empty result set to avoid division by zero)
    all_precision = [r["precision"] for r in results if r["precision"] is not None]
    all_recall = [r["recall"] for r in results if r["recall"] is not None]
    all_noise = [r["noise_rate"] for r in results if r["noise_rate"] is not None]

    report.append("\n## Overall Metrics")
    if all_precision:
        report.append(f"- Average Precision: {sum(all_precision)/len(all_precision):.2f}")
    if all_recall:
        report.append(f"- Average Recall: {sum(all_recall)/len(all_recall):.2f}")
    if all_noise:
        report.append(f"- Average Noise Rate: {sum(all_noise)/len(all_noise):.2f}")

    return "\n".join(report)


# Manual evaluation checklist
EVALUATION_CHECKLIST = """
## Manual Evaluation Checklist

For each orphan connection, evaluate:

1. **Semantic Validity** (Is the connection logically correct?)
   - [ ] The entities are genuinely related
   - [ ] The relationship type makes sense
   - [ ] A human expert would agree with this connection

2. **Retrieval Impact** (Does this help or hurt queries?)
   - [ ] Queries about entity A now appropriately include entity B
   - [ ] Queries about entity B now appropriately include entity A
   - [ ] No unrelated queries are polluted by this connection

3. **Specificity** (Is the connection too broad?)
   - [ ] The connection is specific enough to be useful
   - [ ] Not just "both are technology" or "both are nouns"
   - [ ] The relationship description is meaningful

4. **Directionality** (Does the relationship make sense both ways?)
   - [ ] Query for A -> retrieves B makes sense
   - [ ] Query for B -> retrieves A makes sense

## Red Flags to Watch For:
- Connections between entirely different domains (e.g., Climate -> Quantum)
- Very low similarity scores with high confidence (LLM hallucination?)
- Hub entities getting too many connections (becoming noise magnets)
- Circular clusters forming (A->B->C->A with no external connections)
"""


if __name__ == "__main__":
    print("Orphan Connection Quality Test Framework")
    print("=" * 50)
    print(f"Total test cases: {len(TEST_CASES)}")
    print(f"- Precision tests: {len([t for t in TEST_CASES if t.category == 'precision'])}")
    print(f"- Recall tests: {len([t for t in TEST_CASES if t.category == 'recall'])}")
    print(f"- Noise tests: {len([t for t in TEST_CASES if t.category == 'noise'])}")
    print("\nRun with a LightRAG instance to execute tests.")
    print(EVALUATION_CHECKLIST)