chore(docker-compose, lightrag): optimize test infrastructure and add evaluation tools
Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning, Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality assessment. Introduce four new evaluation utilities: compare_results.py for A/B test analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py for automated evaluation pipelines, and ingest_test_docs.py for batch document ingestion. Update docker-compose.test.yml with aggressive async settings, memory limits, and optimized chunking parameters. Parallelize entity summarization in operate.py to speed up extraction. Fix typos in the merge node/edge log messages.
Parent: d2c9e6e2ec
Commit: ef7327bb3e
7 changed files with 1311 additions and 11 deletions
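Taken together, the new utilities support a gated evaluation loop. A minimal driver sketch (the file names baseline.json and experiment.json are illustrative; the flags come from the scripts in this commit):

import subprocess

# Run the full A/B pipeline (ingest twice, evaluate twice, compare).
subprocess.run(
    ["python", "lightrag/evaluation/e2e_test_harness.py", "--ab-test"],
    check=True,
)

# Or compare two previously saved RAGAS result files; compare_results.py
# exits non-zero on SIGNIFICANT_REGRESSION, so CI can gate on it.
subprocess.run(
    ["python", "lightrag/evaluation/compare_results.py", "baseline.json", "experiment.json"],
    check=True,
)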
docker-compose.test.yml
@@ -14,11 +14,35 @@ services:
       - "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres
     volumes:
       - pgdata_test:/var/lib/postgresql/data
+    command: |
+      postgres
+      -c shared_preload_libraries='vector,age'
+      -c max_connections=150
+      -c shared_buffers=768MB
+      -c work_mem=32MB
+      -c checkpoint_completion_target=0.9
+      -c effective_cache_size=2GB
+      -c maintenance_work_mem=192MB
+      -c wal_compression=on
+      -c checkpoint_timeout=10min
+      -c max_wal_size=1GB
+      -c random_page_cost=1.1
+      -c effective_io_concurrency=200
+      -c max_worker_processes=12
+      -c max_parallel_workers_per_gather=4
+      -c max_parallel_workers=8
+      -c max_parallel_maintenance_workers=4
+      -c jit_above_cost=50000
+      -c jit_inline_above_cost=250000
+      -c jit_optimize_above_cost=250000
+      -c default_statistics_target=200
+      -c hash_mem_multiplier=4
     healthcheck:
       test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
       interval: 5s
       timeout: 5s
       retries: 5
+    mem_limit: 2g

   lightrag:
     container_name: lightrag-test
@@ -67,8 +91,14 @@ services:
       - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
       - ENTITY_RESOLUTION_MAX_CANDIDATES=3

-      # Processing
-      - MAX_ASYNC=4
+      # Processing - Aggressive settings from agent-sdk
+      - MAX_ASYNC=96
+      - MAX_PARALLEL_INSERT=10
+      - EMBEDDING_FUNC_MAX_ASYNC=16
+      - EMBEDDING_BATCH_NUM=48
+
+      # Gunicorn - 8 workers x 4 threads = 32 concurrent handlers
+      - GUNICORN_CMD_ARGS=--workers=8 --worker-class=gthread --threads=4 --worker-connections=1000 --timeout=120 --keep-alive=5 --graceful-timeout=30

       # Extraction Optimization - Reduce Orphan Nodes
       - CHUNK_SIZE=800 # Smaller chunks for focused extraction
@@ -84,12 +114,23 @@ services:
     depends_on:
       postgres:
         condition: service_healthy
+    entrypoint: []
+    command:
+      - python
+      - /app/lightrag/api/run_with_gunicorn.py
+      - --workers
+      - "8"
+      - --llm-binding
+      - openai
+      - --embedding-binding
+      - openai
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
       interval: 10s
       timeout: 5s
       retries: 10
-      start_period: 30s
+      start_period: 60s
+    mem_limit: 2g

 volumes:
   pgdata_test:
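A quick way to confirm the tuned flags actually took effect inside the test container is to query the live server settings. A minimal sketch, assuming psycopg2 is installed; the port, user, and database come from the compose file above, while the password is environment-specific and assumed here:

import psycopg2  # any Postgres client works; psycopg2 is an assumption

conn = psycopg2.connect(
    host="localhost", port=5433,          # host port mapped in docker-compose.test.yml
    dbname="lightrag", user="lightrag",
    password="lightrag",                  # assumption -- substitute your test value
)
with conn.cursor() as cur:
    for setting in ("shared_buffers", "max_connections", "work_mem"):
        cur.execute(f"SHOW {setting}")
        print(setting, "=", cur.fetchone()[0])
conn.close()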
lightrag/evaluation/compare_results.py — Normal file, 322 lines
@@ -0,0 +1,322 @@
#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation

Compares two RAGAS evaluation result files to determine if a change
(e.g., orphan connections) improved or degraded retrieval quality.

Usage:
    python lightrag/evaluation/compare_results.py baseline.json experiment.json
    python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""

import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""
    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% change


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []

    baseline_avg = baseline.get("average_metrics", {})
    experiment_avg = experiment.get("average_metrics", {})

    metrics_to_compare = [
        ("faithfulness", "Faithfulness"),
        ("answer_relevance", "Answer Relevance"),
        ("context_recall", "Context Recall"),
        ("context_precision", "Context Precision"),
        ("ragas_score", "RAGAS Score"),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))

        abs_change = e_val - b_val
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0

        comparisons.append(MetricComparison(
            metric_name=metric_name,
            baseline_value=b_val,
            experiment_value=e_val,
            absolute_change=abs_change,
            relative_change_percent=rel_change,
            improved=abs_change > 0,
            significant=abs(rel_change) > 5,  # > 5% is significant
        ))

    return comparisons


def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get("benchmark_stats", {})
    experiment_stats = experiment.get("benchmark_stats", {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict
    ragas_comparison = next((c for c in comparisons if c.metric_name == "RAGAS Score"), None)

    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = "SIGNIFICANT_IMPROVEMENT"
            verdict_description = f"RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%"
        elif ragas_comparison.improved:
            verdict = "MINOR_IMPROVEMENT"
            verdict_description = f"RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%"
        elif ragas_comparison.significant:
            verdict = "SIGNIFICANT_REGRESSION"
            verdict_description = f"RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%"
        elif ragas_comparison.absolute_change == 0:
            verdict = "NO_CHANGE"
            verdict_description = "No measurable difference between runs"
        else:
            verdict = "MINOR_REGRESSION"
            verdict_description = f"RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%"
    else:
        verdict = "UNKNOWN"
        verdict_description = "Could not determine RAGAS score comparison"

    return {
        "analysis_timestamp": datetime.now().isoformat(),
        "baseline_file": str(baseline_path),
        "experiment_file": str(experiment_path),
        "verdict": verdict,
        "verdict_description": verdict_description,
        "summary": {
            "metrics_improved": improvements,
            "metrics_regressed": regressions,
            "significant_improvements": significant_improvements,
            "significant_regressions": significant_regressions,
        },
        "metrics": [
            {
                "name": c.metric_name,
                "baseline": round(c.baseline_value, 4),
                "experiment": round(c.experiment_value, 4),
                "change": round(c.absolute_change, 4),
                "change_percent": round(c.relative_change_percent, 2),
                "improved": c.improved,
                "significant": c.significant,
            }
            for c in comparisons
        ],
        "baseline_summary": {
            "total_tests": baseline_stats.get("total_tests", 0),
            "successful_tests": baseline_stats.get("successful_tests", 0),
            "success_rate": baseline_stats.get("success_rate", 0),
        },
        "experiment_summary": {
            "total_tests": experiment_stats.get("total_tests", 0),
            "successful_tests": experiment_stats.get("successful_tests", 0),
            "success_rate": experiment_stats.get("success_rate", 0),
        },
    }


def print_comparison_report(analysis: dict):
    """Print a formatted comparison report to stdout."""
    print("=" * 70)
    print("A/B TEST COMPARISON REPORT")
    print("=" * 70)
    print(f"Baseline:   {analysis['baseline_file']}")
    print(f"Experiment: {analysis['experiment_file']}")
    print("-" * 70)

    # Verdict
    verdict = analysis["verdict"]
    verdict_icon = {
        "SIGNIFICANT_IMPROVEMENT": "PASS",
        "MINOR_IMPROVEMENT": "PASS",
        "NO_CHANGE": "~",
        "MINOR_REGRESSION": "WARN",
        "SIGNIFICANT_REGRESSION": "FAIL",
        "UNKNOWN": "?",
    }.get(verdict, "?")

    print(f"\n[{verdict_icon}] VERDICT: {verdict}")
    print(f"    {analysis['verdict_description']}")

    # Metrics table
    print("\n" + "-" * 70)
    print(f"{'Metric':<20} {'Baseline':>10} {'Experiment':>10} {'Change':>10} {'Status':>10}")
    print("-" * 70)

    for metric in analysis["metrics"]:
        name = metric["name"]
        baseline = f"{metric['baseline']:.4f}"
        experiment = f"{metric['experiment']:.4f}"

        change = metric["change"]
        change_pct = metric["change_percent"]
        if change > 0:
            change_str = f"+{change:.4f}"
            status = f"+{change_pct:.1f}%"
        elif change < 0:
            change_str = f"{change:.4f}"
            status = f"{change_pct:.1f}%"
        else:
            change_str = "0.0000"
            status = "0.0%"

        if metric["significant"]:
            if metric["improved"]:
                status = f"[UP] {status}"
            else:
                status = f"[DOWN] {status}"
        else:
            status = f"  {status}"

        print(f"{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}")

    print("-" * 70)

    # Summary
    summary = analysis["summary"]
    print(f"\nSummary: {summary['metrics_improved']} improved, {summary['metrics_regressed']} regressed")
    print(f"         {summary['significant_improvements']} significant improvements, {summary['significant_regressions']} significant regressions")

    # Test counts
    b_summary = analysis["baseline_summary"]
    e_summary = analysis["experiment_summary"]
    print(f"\nBaseline:   {b_summary['successful_tests']}/{b_summary['total_tests']} tests ({b_summary['success_rate']:.1f}% success)")
    print(f"Experiment: {e_summary['successful_tests']}/{e_summary['total_tests']} tests ({e_summary['success_rate']:.1f}% success)")

    print("=" * 70)


def main():
    parser = argparse.ArgumentParser(
        description="Compare RAGAS evaluation results from two runs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Compare baseline vs experiment
    python lightrag/evaluation/compare_results.py baseline.json experiment.json

    # Save comparison to file
    python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json

    # Compare with/without orphan connections
    python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
        """,
    )

    parser.add_argument("baseline", type=str, help="Path to baseline results JSON file")
    parser.add_argument("experiment", type=str, help="Path to experiment results JSON file")
    parser.add_argument("--output", "-o", type=str, default=None, help="Output path for comparison JSON (optional)")

    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        print(f"Error: Baseline file not found: {baseline_path}")
        sys.exit(1)
    if not experiment_path.exists():
        print(f"Error: Experiment file not found: {experiment_path}")
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            json.dump(analysis, f, indent=2)
        print(f"\nComparison saved to: {output_path}")

    # Exit with status based on verdict
    if analysis["verdict"] in ("SIGNIFICANT_REGRESSION",):
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()
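As a worked example of the 5% significance threshold above: a RAGAS score moving from 0.70 to 0.74 is a +5.7% relative change, which compare_metrics flags as significant (the import path is assumed from this commit's layout):

from lightrag.evaluation.compare_results import compare_metrics  # path assumed

baseline = {"average_metrics": {"ragas_score": 0.70}}
experiment = {"average_metrics": {"ragas_score": 0.74}}

for c in compare_metrics(baseline, experiment):
    if c.metric_name == "RAGAS Score":
        # 0.04 / 0.70 * 100 = +5.7%, above the 5% threshold
        print(f"{c.relative_change_percent:+.1f}% significant={c.significant}")
        # -> +5.7% significant=True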
lightrag/evaluation/download_wikipedia.py — Normal file, 183 lines
@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
Download Wikipedia articles for LightRAG ingestion testing.

This script fetches plain text from Wikipedia articles across diverse domains
to create a test dataset with intentional entity overlap for testing:
- Entity merging and summarization
- Cross-domain relationships
- Parallel processing optimizations

Usage:
    python lightrag/evaluation/download_wikipedia.py
    python lightrag/evaluation/download_wikipedia.py --output wiki_docs/
    python lightrag/evaluation/download_wikipedia.py --domains medical,climate
"""

import argparse
import asyncio
from pathlib import Path

import httpx

# Wikipedia API endpoint (no auth required)
WIKI_API = "https://en.wikipedia.org/w/api.php"

# User-Agent required by Wikipedia API policy
# See: https://meta.wikimedia.org/wiki/User-Agent_policy
USER_AGENT = "LightRAG-Test-Downloader/1.0 (https://github.com/HKUDS/LightRAG; claude@example.com)"

# Article selection by domain - chosen for entity overlap
# WHO → Medical + Climate
# Carbon/Emissions → Climate + Finance (ESG)
# Germany/Brazil → Sports + general knowledge
ARTICLES = {
    "medical": ["Diabetes", "COVID-19"],
    "finance": ["Stock_market", "Cryptocurrency"],
    "climate": ["Climate_change", "Renewable_energy"],
    "sports": ["FIFA_World_Cup", "Olympic_Games"],
}


async def fetch_article(title: str, client: httpx.AsyncClient) -> dict | None:
    """Fetch Wikipedia article text via API.

    Args:
        title: Wikipedia article title (use underscores for spaces)
        client: Async HTTP client

    Returns:
        Dict with title, content, and source; or None if not found
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,  # Plain text, no HTML
        "format": "json",
    }
    response = await client.get(WIKI_API, params=params)

    # Check for HTTP errors
    if response.status_code != 200:
        print(f"  HTTP {response.status_code} for {title}")
        return None

    # Handle empty response
    if not response.content:
        print(f"  Empty response for {title}")
        return None

    try:
        data = response.json()
    except Exception as e:
        print(f"  JSON parse error for {title}: {e}")
        return None

    pages = data.get("query", {}).get("pages", {})

    for page_id, page in pages.items():
        if page_id != "-1":  # -1 = not found
            return {
                "title": page.get("title", title),
                "content": page.get("extract", ""),
                "source": f"wikipedia_{title}",
            }
    return None


async def download_articles(
    domains: list[str],
    output_dir: Path,
) -> list[dict]:
    """Download all articles for selected domains.

    Args:
        domains: List of domain names (e.g., ["medical", "climate"])
        output_dir: Directory to save downloaded articles

    Returns:
        List of article metadata dicts
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    articles = []

    headers = {"User-Agent": USER_AGENT}
    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        for domain in domains:
            titles = ARTICLES.get(domain, [])
            if not titles:
                print(f"[{domain.upper()}] Unknown domain, skipping")
                continue

            print(f"[{domain.upper()}] Downloading {len(titles)} articles...")

            for title in titles:
                article = await fetch_article(title, client)
                if article:
                    # Save to file
                    filename = f"{domain}_{title.lower().replace(' ', '_')}.txt"
                    filepath = output_dir / filename
                    filepath.write_text(article["content"])

                    word_count = len(article["content"].split())
                    print(f"  ✓ {title}: {word_count:,} words")

                    articles.append(
                        {
                            "domain": domain,
                            "title": article["title"],
                            "file": str(filepath),
                            "words": word_count,
                            "source": article["source"],
                        }
                    )
                else:
                    print(f"  ✗ {title}: Not found")

    return articles


async def main():
    parser = argparse.ArgumentParser(description="Download Wikipedia test articles")
    parser.add_argument("--output", "-o", type=str, default="lightrag/evaluation/wiki_documents", help="Output directory for downloaded articles")
    parser.add_argument("--domains", "-d", type=str, default="medical,finance,climate,sports", help="Comma-separated domains to download")
    args = parser.parse_args()

    domains = [d.strip() for d in args.domains.split(",")]
    output_dir = Path(args.output)

    print("=== Wikipedia Article Downloader ===")
    print(f"Domains: {', '.join(domains)}")
    print(f"Output: {output_dir}/")
    print()

    articles = await download_articles(domains, output_dir)

    total_words = sum(a["words"] for a in articles)
    print()
    print(f"✓ Downloaded {len(articles)} articles ({total_words:,} words total)")
    print(f"  Output: {output_dir}/")

    # Print summary by domain
    print("\nBy domain:")
    for domain in domains:
        domain_articles = [a for a in articles if a["domain"] == domain]
        domain_words = sum(a["words"] for a in domain_articles)
        print(f"  {domain}: {len(domain_articles)} articles, {domain_words:,} words")


if __name__ == "__main__":
    asyncio.run(main())
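For a quick spot check of fetch_article without writing any files, something like the following should work (the import path is assumed from this commit's layout):

import asyncio

import httpx

from lightrag.evaluation.download_wikipedia import USER_AGENT, fetch_article  # path assumed

async def peek():
    # Same client setup as download_articles: 30s timeout, policy-compliant UA.
    async with httpx.AsyncClient(timeout=30.0, headers={"User-Agent": USER_AGENT}) as client:
        article = await fetch_article("Diabetes", client)
        if article:
            print(article["title"], "-", len(article["content"].split()), "words")

asyncio.run(peek())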
lightrag/evaluation/e2e_test_harness.py — Normal file, 531 lines
@@ -0,0 +1,531 @@
#!/usr/bin/env python3
"""
E2E RAGAS Test Harness for LightRAG

Complete end-to-end testing pipeline:
1. Download arXiv papers (reproducible test data)
2. Clear existing data (optional)
3. Ingest papers into LightRAG
4. Wait for processing
5. Generate Q&A dataset
6. Run RAGAS evaluation
7. Optional: A/B comparison

Usage:
    # Full E2E test
    python lightrag/evaluation/e2e_test_harness.py

    # A/B comparison (with/without orphan connections)
    python lightrag/evaluation/e2e_test_harness.py --ab-test

    # Skip download if papers exist
    python lightrag/evaluation/e2e_test_harness.py --skip-download

    # Use existing dataset
    python lightrag/evaluation/e2e_test_harness.py --dataset existing_dataset.json
"""

import argparse
import asyncio
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any

import httpx
from dotenv import load_dotenv

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

# Load environment variables
load_dotenv(dotenv_path=".env", override=False)

# Configuration
DEFAULT_RAG_URL = "http://localhost:9622"
DEFAULT_PAPERS = ["2312.10997", "2404.10981", "2005.11401"]
POLL_INTERVAL_SECONDS = 10
MAX_WAIT_SECONDS = 600  # 10 minutes max wait for processing


class E2ETestHarness:
    """End-to-end test harness for LightRAG RAGAS evaluation."""

    def __init__(
        self,
        rag_url: str = None,
        paper_ids: list[str] = None,
        questions_per_paper: int = 5,
        skip_download: bool = False,
        skip_ingest: bool = False,
        dataset_path: str = None,
        output_dir: str = None,
    ):
        self.rag_url = (rag_url or os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)).rstrip("/")
        self.paper_ids = paper_ids or DEFAULT_PAPERS
        self.questions_per_paper = questions_per_paper
        self.skip_download = skip_download
        self.skip_ingest = skip_ingest
        self.dataset_path = Path(dataset_path) if dataset_path else None

        # Determine directories
        self.eval_dir = Path(__file__).parent
        self.papers_dir = self.eval_dir / "papers"
        self.results_dir = Path(output_dir) if output_dir else self.eval_dir / "results"
        self.results_dir.mkdir(parents=True, exist_ok=True)

        # API key for LightRAG
        self.api_key = os.getenv("LIGHTRAG_API_KEY")

    async def check_lightrag_health(self) -> bool:
        """Check if LightRAG API is accessible."""
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.get(f"{self.rag_url}/health")
                response.raise_for_status()
                print(f"[OK] LightRAG API accessible at {self.rag_url}")
                return True
        except Exception as e:
            print(f"[ERROR] Cannot connect to LightRAG API: {e}")
            return False

    async def download_papers(self) -> list[str]:
        """Download arXiv papers."""
        if self.skip_download:
            print("[SKIP] Paper download (--skip-download)")
            # Check existing papers
            existing = [
                str(self.papers_dir / f"{pid}.pdf")
                for pid in self.paper_ids
                if (self.papers_dir / f"{pid}.pdf").exists()
            ]
            print(f"[INFO] Found {len(existing)} existing papers")
            return existing

        print("\n" + "=" * 60)
        print("STEP 1: Download arXiv Papers")
        print("=" * 60)

        from lightrag.evaluation.download_arxiv import download_papers

        results = await download_papers(self.paper_ids, self.papers_dir)
        return [r["path"] for r in results if r["status"] in ("downloaded", "exists")]

    async def clear_existing_data(self) -> bool:
        """Clear existing documents in LightRAG (optional)."""
        print("\n[INFO] Clearing existing data...")
        try:
            headers = {"X-API-Key": self.api_key} if self.api_key else {}
            async with httpx.AsyncClient(timeout=60.0) as client:
                # Get current documents
                response = await client.get(
                    f"{self.rag_url}/documents",
                    headers=headers,
                )
                response.raise_for_status()
                docs = response.json()

                # Clear all documents
                statuses = docs.get("statuses", {})
                all_docs = []
                for status_docs in statuses.values():
                    all_docs.extend(status_docs)

                if all_docs:
                    print(f"[INFO] Clearing {len(all_docs)} existing documents...")
                    for doc in all_docs:
                        doc_id = doc.get("id")
                        if doc_id:
                            await client.delete(
                                f"{self.rag_url}/documents/{doc_id}",
                                headers=headers,
                            )
                    print("[OK] Cleared existing documents")
                else:
                    print("[OK] No existing documents to clear")

                return True
        except Exception as e:
            print(f"[WARN] Could not clear data: {e}")
            return False

    async def ingest_papers(self, paper_paths: list[str]) -> bool:
        """Ingest papers into LightRAG."""
        if self.skip_ingest:
            print("[SKIP] Paper ingestion (--skip-ingest)")
            return True

        print("\n" + "=" * 60)
        print("STEP 2: Ingest Papers into LightRAG")
        print("=" * 60)

        headers = {"X-API-Key": self.api_key} if self.api_key else {}

        async with httpx.AsyncClient(timeout=300.0) as client:
            for paper_path in paper_paths:
                path = Path(paper_path)
                if not path.exists():
                    print(f"[WARN] Paper not found: {paper_path}")
                    continue

                print(f"[UPLOAD] {path.name}")

                try:
                    with open(path, "rb") as f:
                        files = {"file": (path.name, f, "application/pdf")}
                        response = await client.post(
                            f"{self.rag_url}/documents/upload",
                            files=files,
                            headers=headers,
                        )
                    response.raise_for_status()
                    result = response.json()
                    print(f"  [OK] Uploaded: {result}")
                except Exception as e:
                    print(f"  [ERROR] Upload failed: {e}")

        return True

    async def wait_for_processing(self) -> bool:
        """Wait for all documents to finish processing."""
        print("\n" + "=" * 60)
        print("STEP 3: Wait for Document Processing")
        print("=" * 60)

        headers = {"X-API-Key": self.api_key} if self.api_key else {}
        start_time = time.time()

        async with httpx.AsyncClient(timeout=30.0) as client:
            while time.time() - start_time < MAX_WAIT_SECONDS:
                try:
                    response = await client.get(
                        f"{self.rag_url}/documents",
                        headers=headers,
                    )
                    response.raise_for_status()
                    docs = response.json()

                    statuses = docs.get("statuses", {})
                    # API returns lowercase status keys
                    processing = len(statuses.get("processing", []))
                    pending = len(statuses.get("pending", []))
                    completed = len(statuses.get("processed", []))  # Note: "processed" not "completed"
                    failed = len(statuses.get("failed", []))

                    elapsed = int(time.time() - start_time)
                    print(f"  [{elapsed}s] Processing: {processing}, Pending: {pending}, Completed: {completed}, Failed: {failed}")

                    if processing == 0 and pending == 0:
                        print("[OK] All documents processed")
                        return True

                except Exception as e:
                    print(f"  [WARN] Status check failed: {e}")

                await asyncio.sleep(POLL_INTERVAL_SECONDS)

        print("[ERROR] Timeout waiting for document processing")
        return False

    async def generate_dataset(self) -> Path:
        """Generate Q&A dataset from ingested papers."""
        if self.dataset_path and self.dataset_path.exists():
            print(f"[SKIP] Using existing dataset: {self.dataset_path}")
            return self.dataset_path

        print("\n" + "=" * 60)
        print("STEP 4: Generate Q&A Dataset")
        print("=" * 60)

        from lightrag.evaluation.generate_arxiv_dataset import generate_dataset

        output_path = self.eval_dir / f"arxiv_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        await generate_dataset(
            paper_ids=self.paper_ids,
            questions_per_paper=self.questions_per_paper,
            rag_url=self.rag_url,
            output_path=output_path,
        )

        return output_path

    async def run_ragas_evaluation(self, dataset_path: Path) -> dict:
        """Run RAGAS evaluation."""
        print("\n" + "=" * 60)
        print("STEP 5: Run RAGAS Evaluation")
        print("=" * 60)

        from lightrag.evaluation.eval_rag_quality import RAGEvaluator

        evaluator = RAGEvaluator(
            test_dataset_path=str(dataset_path),
            rag_api_url=self.rag_url,
        )

        results = await evaluator.run()
        return results

    async def run_full_pipeline(self) -> dict:
        """Run the complete E2E test pipeline."""
        print("=" * 70)
        print("E2E RAGAS TEST HARNESS FOR LIGHTRAG")
        print("=" * 70)
        print(f"RAG URL: {self.rag_url}")
        print(f"Papers: {', '.join(self.paper_ids)}")
        print(f"Questions: {self.questions_per_paper} per paper")
        print(f"Results: {self.results_dir}")
        print("=" * 70)

        start_time = time.time()

        # Check LightRAG is accessible
        if not await self.check_lightrag_health():
            return {"error": "LightRAG API not accessible"}

        # Step 1: Download papers
        paper_paths = await self.download_papers()
        if not paper_paths:
            return {"error": "No papers to process"}

        # Step 2: Ingest papers
        if not await self.ingest_papers(paper_paths):
            return {"error": "Paper ingestion failed"}

        # Step 3: Wait for processing
        if not self.skip_ingest:
            if not await self.wait_for_processing():
                return {"error": "Document processing timeout"}

        # Step 4: Generate dataset
        dataset_path = await self.generate_dataset()

        # Step 5: Run RAGAS evaluation
        results = await self.run_ragas_evaluation(dataset_path)

        elapsed_time = time.time() - start_time

        # Save summary
        summary = {
            "pipeline_completed_at": datetime.now().isoformat(),
            "total_elapsed_seconds": round(elapsed_time, 2),
            "papers": self.paper_ids,
            "dataset_path": str(dataset_path),
            "ragas_results": results.get("benchmark_stats", {}),
        }

        summary_path = self.results_dir / f"e2e_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2)

        print("\n" + "=" * 70)
        print("E2E PIPELINE COMPLETE")
        print("=" * 70)
        print(f"Total time: {elapsed_time:.1f} seconds")
        print(f"Summary saved: {summary_path}")
        print("=" * 70)

        return summary


async def run_ab_test(
    harness_config: dict,
    clear_between_runs: bool = True,
) -> dict:
    """
    Run A/B test comparing with/without orphan connections.

    Args:
        harness_config: Configuration for E2ETestHarness
        clear_between_runs: Clear data between A and B runs

    Returns:
        A/B comparison results
    """
    print("=" * 70)
    print("A/B TEST: WITH vs WITHOUT ORPHAN CONNECTIONS")
    print("=" * 70)

    results = {}

    # Test A: WITHOUT orphan connections
    print("\n[A] Running WITHOUT orphan connections...")
    os.environ["AUTO_CONNECT_ORPHANS"] = "false"

    harness_a = E2ETestHarness(**harness_config)
    results["without_orphans"] = await harness_a.run_full_pipeline()

    # Clear for next run
    if clear_between_runs:
        await harness_a.clear_existing_data()

    # Test B: WITH orphan connections
    print("\n[B] Running WITH orphan connections...")
    os.environ["AUTO_CONNECT_ORPHANS"] = "true"

    # Force re-ingest for test B
    harness_config_b = harness_config.copy()
    harness_config_b["skip_download"] = True  # Papers already downloaded
    harness_config_b["skip_ingest"] = False  # Need to re-ingest

    harness_b = E2ETestHarness(**harness_config_b)
    results["with_orphans"] = await harness_b.run_full_pipeline()

    # Compare results
    print("\n" + "=" * 70)
    print("A/B COMPARISON")
    print("=" * 70)

    a_stats = results["without_orphans"].get("ragas_results", {}).get("average_metrics", {})
    b_stats = results["with_orphans"].get("ragas_results", {}).get("average_metrics", {})

    comparison = {
        "timestamp": datetime.now().isoformat(),
        "without_orphans": a_stats,
        "with_orphans": b_stats,
        "improvement": {},
    }

    for metric in ["faithfulness", "answer_relevance", "context_recall", "context_precision", "ragas_score"]:
        a_val = a_stats.get(metric, 0)
        b_val = b_stats.get(metric, 0)
        diff = b_val - a_val
        pct = (diff / a_val * 100) if a_val > 0 else 0

        comparison["improvement"][metric] = {
            "absolute": round(diff, 4),
            "percent": round(pct, 2),
        }

        status = "UP" if diff > 0 else ("DOWN" if diff < 0 else "~")
        print(f"  {metric:<20} A: {a_val:.4f}  B: {b_val:.4f}  [{status}] {pct:+.1f}%")

    # Verdict
    ragas_improvement = comparison["improvement"].get("ragas_score", {}).get("percent", 0)
    if ragas_improvement > 5:
        verdict = "ORPHAN CONNECTIONS IMPROVE QUALITY"
    elif ragas_improvement < -5:
        verdict = "ORPHAN CONNECTIONS DEGRADE QUALITY"
    else:
        verdict = "NO SIGNIFICANT DIFFERENCE"

    comparison["verdict"] = verdict
    print(f"\nVERDICT: {verdict}")

    # Save comparison
    comp_path = harness_a.results_dir / f"ab_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(comp_path, "w") as f:
        json.dump(comparison, f, indent=2)
    print(f"\nComparison saved: {comp_path}")

    return comparison


async def main():
    parser = argparse.ArgumentParser(
        description="E2E RAGAS Test Harness for LightRAG",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Full E2E test
    python lightrag/evaluation/e2e_test_harness.py

    # A/B test (with/without orphan connections)
    python lightrag/evaluation/e2e_test_harness.py --ab-test

    # Skip paper download
    python lightrag/evaluation/e2e_test_harness.py --skip-download

    # Use existing dataset
    python lightrag/evaluation/e2e_test_harness.py --dataset arxiv_dataset.json
        """,
    )

    parser.add_argument("--rag-url", "-r", type=str, default=None, help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})")
    parser.add_argument("--papers", "-p", type=str, default=None, help="Comma-separated arXiv paper IDs")
    parser.add_argument("--questions", "-q", type=int, default=5, help="Questions per paper (default: 5)")
    parser.add_argument("--skip-download", action="store_true", help="Skip paper download (use existing)")
    parser.add_argument("--skip-ingest", action="store_true", help="Skip paper ingestion (use existing data)")
    parser.add_argument("--dataset", "-d", type=str, default=None, help="Path to existing Q&A dataset (skip generation)")
    parser.add_argument("--output-dir", "-o", type=str, default=None, help="Output directory for results")
    parser.add_argument("--ab-test", action="store_true", help="Run A/B test comparing with/without orphan connections")

    args = parser.parse_args()

    # Parse paper IDs
    paper_ids = None
    if args.papers:
        paper_ids = [p.strip() for p in args.papers.split(",")]

    harness_config = {
        "rag_url": args.rag_url,
        "paper_ids": paper_ids,
        "questions_per_paper": args.questions,
        "skip_download": args.skip_download,
        "skip_ingest": args.skip_ingest,
        "dataset_path": args.dataset,
        "output_dir": args.output_dir,
    }

    if args.ab_test:
        await run_ab_test(harness_config)
    else:
        harness = E2ETestHarness(**harness_config)
        await harness.run_full_pipeline()


if __name__ == "__main__":
    asyncio.run(main())
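The harness can also be driven programmatically rather than via the CLI; a minimal sketch reusing the constructor arguments above (the paper ID and counts are illustrative, and skip_download assumes the PDF is already in the papers directory):

import asyncio

from lightrag.evaluation.e2e_test_harness import E2ETestHarness  # path assumed

async def smoke():
    harness = E2ETestHarness(
        rag_url="http://localhost:9622",  # default from the module
        paper_ids=["2312.10997"],         # one paper keeps the run short
        questions_per_paper=2,
        skip_download=True,               # reuse an already-downloaded PDF
    )
    summary = await harness.run_full_pipeline()
    print(summary.get("ragas_results", {}))

asyncio.run(smoke())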
lightrag/evaluation/ingest_test_docs.py — Normal file, 170 lines
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Ingest test documents into LightRAG for testing.

This script reads text files from a directory and batch-uploads them to
LightRAG via the /documents/texts API endpoint, then polls for completion.

Usage:
    python lightrag/evaluation/ingest_test_docs.py
    python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
"""

import argparse
import asyncio
import os
import time
from pathlib import Path

import httpx

DEFAULT_RAG_URL = "http://localhost:9622"


async def ingest_documents(
    input_dir: Path,
    rag_url: str,
) -> dict:
    """Ingest all text files from directory into LightRAG.

    Args:
        input_dir: Directory containing .txt or .md files
        rag_url: LightRAG API base URL

    Returns:
        Dict with ingestion statistics
    """
    timeout = httpx.Timeout(120.0, connect=30.0)
    api_key = os.getenv("LIGHTRAG_API_KEY")
    headers = {"X-API-Key": api_key} if api_key else {}

    async with httpx.AsyncClient(timeout=timeout) as client:
        # Check health
        try:
            health = await client.get(f"{rag_url}/health")
            if health.status_code != 200:
                raise ConnectionError(f"LightRAG not healthy: {health.status_code}")
        except httpx.ConnectError:
            raise ConnectionError(f"Cannot connect to LightRAG at {rag_url}")

        print(f"✓ Connected to LightRAG at {rag_url}")

        # Collect all text files
        files = list(input_dir.glob("*.txt")) + list(input_dir.glob("*.md"))
        if not files:
            print(f"✗ No .txt or .md files found in {input_dir}")
            return {"documents": 0, "elapsed_seconds": 0}

        print(f"  Found {len(files)} documents to ingest")

        # Read all texts
        texts = []
        sources = []
        for file in sorted(files):
            content = file.read_text()
            texts.append(content)
            sources.append(file.name)
            word_count = len(content.split())
            print(f"  {file.name}: {word_count:,} words")

        # Batch ingest via /documents/texts
        print(f"\n  Uploading {len(texts)} documents...")
        start = time.time()

        response = await client.post(
            f"{rag_url}/documents/texts",
            json={"texts": texts, "file_sources": sources},
            headers=headers,
        )
        response.raise_for_status()
        result = response.json()

        track_id = result.get("track_id", "")
        print(f"  Track ID: {track_id}")

        # Poll for completion - wait for processing to start first
        print("  Waiting for processing to start...")
        await asyncio.sleep(2)  # Give server time to queue documents

        last_status = ""
        processed_count = 0
        expected_total = len(texts)
        initial_check = True

        while True:
            status_response = await client.get(f"{rag_url}/documents")
            docs = status_response.json()
            statuses = docs.get("statuses", {})

            processing = len(statuses.get("processing", []))
            pending = len(statuses.get("pending", []))
            processed = len(statuses.get("processed", []))
            total_visible = processing + pending + processed

            current_status = f"Pending: {pending}, Processing: {processing}, Processed: {processed}"
            if current_status != last_status:
                print(f"  {current_status}")
                last_status = current_status
            processed_count = processed

            # Wait until we see at least some of our docs in the queue
            if initial_check and (pending > 0 or processing > 0):
                initial_check = False
                print("  Processing started!")

            # Only exit when processing is done AND we've processed something new
            if processing == 0 and pending == 0 and not initial_check:
                break

            await asyncio.sleep(5)

        elapsed = time.time() - start
        print(f"\n✓ Ingestion complete in {elapsed:.1f}s")
        print(f"  Documents processed: {processed_count}")
        print(f"  Average: {elapsed / len(texts):.1f}s per document")

        return {
            "documents": len(texts),
            "processed": processed_count,
            "elapsed_seconds": elapsed,
            "track_id": track_id,
        }


async def main():
    parser = argparse.ArgumentParser(description="Ingest test documents into LightRAG")
    parser.add_argument("--input", "-i", type=str, default="lightrag/evaluation/wiki_documents", help="Input directory with text files")
    parser.add_argument("--rag-url", "-r", type=str, default=None, help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})")
    args = parser.parse_args()

    input_dir = Path(args.input)
    rag_url = args.rag_url or os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)

    print("=== LightRAG Document Ingestion ===")
    print(f"Input: {input_dir}/")
    print(f"RAG URL: {rag_url}")
    print()

    if not input_dir.exists():
        print(f"✗ Input directory not found: {input_dir}")
        print("  Run download_wikipedia.py first:")
        print("  python lightrag/evaluation/download_wikipedia.py")
        return

    await ingest_documents(input_dir, rag_url)


if __name__ == "__main__":
    asyncio.run(main())
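The batch upload the script performs reduces to a single POST against /documents/texts; a minimal synchronous sketch of the same request shape (the URL and sample text are illustrative):

import httpx

# Payload shape taken from ingest_documents above: parallel lists of
# document bodies and their source names.
resp = httpx.post(
    "http://localhost:9622/documents/texts",
    json={"texts": ["LightRAG test document."], "file_sources": ["inline.txt"]},
    timeout=120.0,
)
resp.raise_for_status()
print("track_id:", resp.json().get("track_id", ""))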
lightrag/evaluation/populate_test_data.sh — Executable file, 43 lines
@@ -0,0 +1,43 @@
#!/bin/bash
# Quick script to populate LightRAG with diverse test documents
#
# This downloads Wikipedia articles across 4 domains (Medical, Finance, Climate, Sports)
# and ingests them into LightRAG. The articles are chosen to have entity overlap
# (WHO, Carbon/Emissions, Organizations) to test entity merging and summarization.
#
# Usage:
#   ./lightrag/evaluation/populate_test_data.sh
#   LIGHTRAG_API_URL=http://localhost:9622 ./lightrag/evaluation/populate_test_data.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
RAG_URL="${LIGHTRAG_API_URL:-http://localhost:9622}"

echo "=== LightRAG Test Data Population ==="
echo "RAG URL: $RAG_URL"
echo ""

# Check if LightRAG is running
if ! curl -s "$RAG_URL/health" > /dev/null 2>&1; then
    echo "✗ Cannot connect to LightRAG at $RAG_URL"
    echo "  Make sure LightRAG is running first"
    exit 1
fi

# 1. Download Wikipedia articles
echo "[1/2] Downloading Wikipedia articles..."
python3 "$SCRIPT_DIR/download_wikipedia.py"

# 2. Ingest into LightRAG
echo ""
echo "[2/2] Ingesting documents..."
python3 "$SCRIPT_DIR/ingest_test_docs.py" --rag-url "$RAG_URL"

echo ""
echo "=== Done! ==="
echo "Documents ingested into LightRAG."
echo ""
echo "Next steps:"
echo "  - Check graph stats: curl $RAG_URL/graph/statistics"
echo "  - Query the data: curl '$RAG_URL/query?mode=global&query=What+is+climate+change'"
operate.py
@@ -280,12 +280,12 @@ async def _handle_entity_relation_summary(
         f"  Summarizing {entity_or_relation_name}: Map {len(current_list)} descriptions into {len(chunks)} groups"
     )

-    # Reduce phase: summarize each group from chunks
-    new_summaries = []
-    for chunk in chunks:
+    # Reduce phase: summarize each group from chunks IN PARALLEL
+    async def _summarize_single_chunk(chunk: list[str]) -> tuple[str, bool]:
+        """Summarize a single chunk, returning (summary, used_llm)."""
         if len(chunk) == 1:
             # Optimization: single description chunks don't need LLM summarization
-            new_summaries.append(chunk[0])
+            return chunk[0], False
         else:
             # Multiple descriptions need LLM summarization
             summary = await _summarize_descriptions(
@@ -295,8 +295,18 @@ async def _handle_entity_relation_summary(
                 global_config,
                 llm_response_cache,
             )
-            new_summaries.append(summary)
-            llm_was_used = True  # Mark that LLM was used in reduce phase
+            return summary, True
+
+    # Create tasks for all chunks and run in parallel
+    tasks = [
+        asyncio.create_task(_summarize_single_chunk(chunk)) for chunk in chunks
+    ]
+    results = await asyncio.gather(*tasks)
+
+    # Collect results while preserving order
+    new_summaries = [result[0] for result in results]
+    if any(result[1] for result in results):
+        llm_was_used = True  # Mark that LLM was used in reduce phase

     # Update current list with new summaries for next iteration
     current_list = new_summaries
@@ -2115,7 +2125,7 @@ async def _merge_nodes_then_upsert(
     deduplicated_num = already_fragment + len(nodes_data) - num_fragment
     dd_message = ""
     if deduplicated_num > 0:
-        # Duplicated description detected across multiple trucks for the same entity
+        # Duplicated description detected across multiple chunks for the same entity
         dd_message = f"dd {deduplicated_num}"

     if dd_message or truncation_info_log:
@@ -2459,7 +2469,7 @@ async def _merge_edges_then_upsert(
     deduplicated_num = already_fragment + len(edges_data) - num_fragment
     dd_message = ""
     if deduplicated_num > 0:
-        # Duplicated description detected across multiple trucks for the same entity
+        # Duplicated description detected across multiple chunks for the same entity
         dd_message = f"dd {deduplicated_num}"

     if dd_message or truncation_info_log:
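The hunk above is the standard asyncio fan-out/fan-in pattern: asyncio.gather preserves input order, so the reduced summaries stay aligned with their chunks even though the awaits complete in any order. A self-contained sketch of the same idea (names here are illustrative, not from operate.py):

import asyncio

async def summarize(chunk: list[str]) -> tuple[str, bool]:
    # Stand-in for the LLM call; single-item chunks skip it entirely.
    if len(chunk) == 1:
        return chunk[0], False
    await asyncio.sleep(0.1)  # simulated LLM latency
    return " / ".join(chunk), True

async def reduce_parallel(chunks: list[list[str]]) -> list[str]:
    # gather() returns results in the order the coroutines were passed in,
    # so summaries line up with chunks regardless of completion order.
    results = await asyncio.gather(*(summarize(c) for c in chunks))
    if any(used for _, used in results):
        print("LLM was used in reduce phase")
    return [s for s, _ in results]

print(asyncio.run(reduce_parallel([["a"], ["b", "c"], ["d", "e", "f"]])))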