Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning, Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality assessment. Introduce four new evaluation utilities: compare_results.py for A/B test analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py for automated evaluation pipelines, and ingest_test_docs.py for batch document ingestion. Update docker-compose.test.yml with aggressive async settings, memory limits, and optimized chunking parameters. Parallelize entity summarization in operate.py for improved extraction performance. Fix typos in merge node/edge logs.
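The harness can also be driven programmatically rather than through the CLI; a minimal sketch using the E2ETestHarness class defined in the file below (argument values here are illustrative):

    import asyncio
    from lightrag.evaluation.e2e_test_harness import E2ETestHarness

    async def demo():
        # Reuse papers already on disk; ask for 3 questions per paper
        harness = E2ETestHarness(skip_download=True, questions_per_paper=3)
        summary = await harness.run_full_pipeline()
        print(summary.get("ragas_results", {}))

    asyncio.run(demo())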
#!/usr/bin/env python3
"""
E2E RAGAS Test Harness for LightRAG

Complete end-to-end testing pipeline:
1. Download arXiv papers (reproducible test data)
2. Clear existing data (optional)
3. Ingest papers into LightRAG
4. Wait for processing
5. Generate Q&A dataset
6. Run RAGAS evaluation
7. Optional: A/B comparison

Usage:
    # Full E2E test
    python lightrag/evaluation/e2e_test_harness.py

    # A/B comparison (with/without orphan connections)
    python lightrag/evaluation/e2e_test_harness.py --ab-test

    # Skip download if papers exist
    python lightrag/evaluation/e2e_test_harness.py --skip-download

    # Use existing dataset
    python lightrag/evaluation/e2e_test_harness.py --dataset existing_dataset.json
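
    # Custom paper set and question count
    python lightrag/evaluation/e2e_test_harness.py --papers 2312.10997,2404.10981 --questions 3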
"""

import argparse
import asyncio
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path

import httpx
from dotenv import load_dotenv

# Add repo root to path so lightrag imports resolve when run as a script
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

# Load environment variables
load_dotenv(dotenv_path=".env", override=False)

# Configuration
DEFAULT_RAG_URL = "http://localhost:9622"
DEFAULT_PAPERS = ["2312.10997", "2404.10981", "2005.11401"]
POLL_INTERVAL_SECONDS = 10
MAX_WAIT_SECONDS = 600  # 10 minutes max wait for processing
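# With these defaults the harness polls document status at most
# MAX_WAIT_SECONDS / POLL_INTERVAL_SECONDS = 60 times before giving up.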


class E2ETestHarness:
    """End-to-end test harness for LightRAG RAGAS evaluation."""

    def __init__(
        self,
        rag_url: str | None = None,
        paper_ids: list[str] | None = None,
        questions_per_paper: int = 5,
        skip_download: bool = False,
        skip_ingest: bool = False,
        dataset_path: str | None = None,
        output_dir: str | None = None,
    ):
        self.rag_url = (rag_url or os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)).rstrip("/")
        self.paper_ids = paper_ids or DEFAULT_PAPERS
        self.questions_per_paper = questions_per_paper
        self.skip_download = skip_download
        self.skip_ingest = skip_ingest
        self.dataset_path = Path(dataset_path) if dataset_path else None

        # Determine directories
        self.eval_dir = Path(__file__).parent
        self.papers_dir = self.eval_dir / "papers"
        self.results_dir = Path(output_dir) if output_dir else self.eval_dir / "results"
        self.results_dir.mkdir(parents=True, exist_ok=True)

        # API key for LightRAG
        self.api_key = os.getenv("LIGHTRAG_API_KEY")

    async def check_lightrag_health(self) -> bool:
        """Check if LightRAG API is accessible."""
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.get(f"{self.rag_url}/health")
                response.raise_for_status()
                print(f"[OK] LightRAG API accessible at {self.rag_url}")
                return True
        except Exception as e:
            print(f"[ERROR] Cannot connect to LightRAG API: {e}")
            return False

    async def download_papers(self) -> list[str]:
        """Download arXiv papers."""
        if self.skip_download:
            print("[SKIP] Paper download (--skip-download)")
            # Check existing papers
            existing = [
                str(self.papers_dir / f"{pid}.pdf")
                for pid in self.paper_ids
                if (self.papers_dir / f"{pid}.pdf").exists()
            ]
            print(f"[INFO] Found {len(existing)} existing papers")
            return existing

        print("\n" + "=" * 60)
        print("STEP 1: Download arXiv Papers")
        print("=" * 60)

        from lightrag.evaluation.download_arxiv import download_papers

        results = await download_papers(self.paper_ids, self.papers_dir)
        return [r["path"] for r in results if r["status"] in ("downloaded", "exists")]

    async def clear_existing_data(self) -> bool:
        """Clear existing documents in LightRAG (optional)."""
        print("\n[INFO] Clearing existing data...")
        try:
            headers = {"X-API-Key": self.api_key} if self.api_key else {}
            async with httpx.AsyncClient(timeout=60.0) as client:
                # Get current documents
                response = await client.get(
                    f"{self.rag_url}/documents",
                    headers=headers,
                )
                response.raise_for_status()
                docs = response.json()

                # Clear all documents
                statuses = docs.get("statuses", {})
                all_docs = []
                for status_docs in statuses.values():
                    all_docs.extend(status_docs)

                if all_docs:
                    print(f"[INFO] Clearing {len(all_docs)} existing documents...")
                    for doc in all_docs:
                        doc_id = doc.get("id")
                        if doc_id:
                            await client.delete(
                                f"{self.rag_url}/documents/{doc_id}",
                                headers=headers,
                            )
                    print("[OK] Cleared existing documents")
                else:
                    print("[OK] No existing documents to clear")

            return True
        except Exception as e:
            print(f"[WARN] Could not clear data: {e}")
            return False

    async def ingest_papers(self, paper_paths: list[str]) -> bool:
        """Ingest papers into LightRAG."""
        if self.skip_ingest:
            print("[SKIP] Paper ingestion (--skip-ingest)")
            return True

        print("\n" + "=" * 60)
        print("STEP 2: Ingest Papers into LightRAG")
        print("=" * 60)

        headers = {"X-API-Key": self.api_key} if self.api_key else {}

        async with httpx.AsyncClient(timeout=300.0) as client:
            for paper_path in paper_paths:
                path = Path(paper_path)
                if not path.exists():
                    print(f"[WARN] Paper not found: {paper_path}")
                    continue

                print(f"[UPLOAD] {path.name}")

                try:
                    with open(path, "rb") as f:
                        files = {"file": (path.name, f, "application/pdf")}
                        response = await client.post(
                            f"{self.rag_url}/documents/upload",
                            files=files,
                            headers=headers,
                        )
                    response.raise_for_status()
                    result = response.json()
                    print(f" [OK] Uploaded: {result}")
                except Exception as e:
                    print(f" [ERROR] Upload failed: {e}")

        return True

    async def wait_for_processing(self) -> bool:
        """Wait for all documents to finish processing."""
        print("\n" + "=" * 60)
        print("STEP 3: Wait for Document Processing")
        print("=" * 60)

        headers = {"X-API-Key": self.api_key} if self.api_key else {}
        start_time = time.time()

        async with httpx.AsyncClient(timeout=30.0) as client:
            while time.time() - start_time < MAX_WAIT_SECONDS:
                try:
                    response = await client.get(
                        f"{self.rag_url}/documents",
                        headers=headers,
                    )
                    response.raise_for_status()
                    docs = response.json()

                    statuses = docs.get("statuses", {})
                    # API returns lowercase status keys
                    processing = len(statuses.get("processing", []))
                    pending = len(statuses.get("pending", []))
                    completed = len(statuses.get("processed", []))  # Note: "processed" not "completed"
                    failed = len(statuses.get("failed", []))

                    elapsed = int(time.time() - start_time)
                    print(f" [{elapsed}s] Processing: {processing}, Pending: {pending}, Completed: {completed}, Failed: {failed}")

                    if processing == 0 and pending == 0:
                        print("[OK] All documents processed")
                        return True

                except Exception as e:
                    print(f" [WARN] Status check failed: {e}")

                await asyncio.sleep(POLL_INTERVAL_SECONDS)

        print("[ERROR] Timeout waiting for document processing")
        return False

    async def generate_dataset(self) -> Path:
        """Generate Q&A dataset from ingested papers."""
        if self.dataset_path and self.dataset_path.exists():
            print(f"[SKIP] Using existing dataset: {self.dataset_path}")
            return self.dataset_path

        print("\n" + "=" * 60)
        print("STEP 4: Generate Q&A Dataset")
        print("=" * 60)

        from lightrag.evaluation.generate_arxiv_dataset import generate_dataset

        output_path = self.eval_dir / f"arxiv_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        await generate_dataset(
            paper_ids=self.paper_ids,
            questions_per_paper=self.questions_per_paper,
            rag_url=self.rag_url,
            output_path=output_path,
        )

        return output_path

    async def run_ragas_evaluation(self, dataset_path: Path) -> dict:
        """Run RAGAS evaluation."""
        print("\n" + "=" * 60)
        print("STEP 5: Run RAGAS Evaluation")
        print("=" * 60)

        from lightrag.evaluation.eval_rag_quality import RAGEvaluator

        evaluator = RAGEvaluator(
            test_dataset_path=str(dataset_path),
            rag_api_url=self.rag_url,
        )

        results = await evaluator.run()
        return results

    async def run_full_pipeline(self) -> dict:
        """Run the complete E2E test pipeline."""
        print("=" * 70)
        print("E2E RAGAS TEST HARNESS FOR LIGHTRAG")
        print("=" * 70)
        print(f"RAG URL: {self.rag_url}")
        print(f"Papers: {', '.join(self.paper_ids)}")
        print(f"Questions: {self.questions_per_paper} per paper")
        print(f"Results: {self.results_dir}")
        print("=" * 70)

        start_time = time.time()

        # Check LightRAG is accessible
        if not await self.check_lightrag_health():
            return {"error": "LightRAG API not accessible"}

        # Step 1: Download papers
        paper_paths = await self.download_papers()
        if not paper_paths:
            return {"error": "No papers to process"}

        # Step 2: Ingest papers
        if not await self.ingest_papers(paper_paths):
            return {"error": "Paper ingestion failed"}

        # Step 3: Wait for processing
        if not self.skip_ingest:
            if not await self.wait_for_processing():
                return {"error": "Document processing timeout"}

        # Step 4: Generate dataset
        dataset_path = await self.generate_dataset()

        # Step 5: Run RAGAS evaluation
        results = await self.run_ragas_evaluation(dataset_path)

        elapsed_time = time.time() - start_time

        # Save summary
        summary = {
            "pipeline_completed_at": datetime.now().isoformat(),
            "total_elapsed_seconds": round(elapsed_time, 2),
            "papers": self.paper_ids,
            "dataset_path": str(dataset_path),
            "ragas_results": results.get("benchmark_stats", {}),
        }

        summary_path = self.results_dir / f"e2e_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2)

        print("\n" + "=" * 70)
        print("E2E PIPELINE COMPLETE")
        print("=" * 70)
        print(f"Total time: {elapsed_time:.1f} seconds")
        print(f"Summary saved: {summary_path}")
        print("=" * 70)

        return summary


async def run_ab_test(
    harness_config: dict,
    clear_between_runs: bool = True,
) -> dict:
    """
    Run A/B test comparing with/without orphan connections.

    Args:
        harness_config: Configuration for E2ETestHarness
        clear_between_runs: Clear data between A and B runs

    Returns:
        A/B comparison results
    """
    print("=" * 70)
    print("A/B TEST: WITH vs WITHOUT ORPHAN CONNECTIONS")
    print("=" * 70)

    results = {}

    # Test A: WITHOUT orphan connections
    print("\n[A] Running WITHOUT orphan connections...")
    os.environ["AUTO_CONNECT_ORPHANS"] = "false"

    harness_a = E2ETestHarness(**harness_config)
    results["without_orphans"] = await harness_a.run_full_pipeline()

    # Clear for next run
    if clear_between_runs:
        await harness_a.clear_existing_data()

    # Test B: WITH orphan connections
    print("\n[B] Running WITH orphan connections...")
    os.environ["AUTO_CONNECT_ORPHANS"] = "true"

    # Force re-ingest for test B
    harness_config_b = harness_config.copy()
    harness_config_b["skip_download"] = True  # Papers already downloaded
    harness_config_b["skip_ingest"] = False  # Need to re-ingest

    harness_b = E2ETestHarness(**harness_config_b)
    results["with_orphans"] = await harness_b.run_full_pipeline()

    # Compare results
    print("\n" + "=" * 70)
    print("A/B COMPARISON")
    print("=" * 70)

    a_stats = results["without_orphans"].get("ragas_results", {}).get("average_metrics", {})
    b_stats = results["with_orphans"].get("ragas_results", {}).get("average_metrics", {})

    comparison = {
        "timestamp": datetime.now().isoformat(),
        "without_orphans": a_stats,
        "with_orphans": b_stats,
        "improvement": {},
    }

    for metric in ["faithfulness", "answer_relevance", "context_recall", "context_precision", "ragas_score"]:
        a_val = a_stats.get(metric, 0)
        b_val = b_stats.get(metric, 0)
        diff = b_val - a_val
        pct = (diff / a_val * 100) if a_val > 0 else 0
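        # When the baseline is 0, the percent change is reported as 0 even if
        # run B scored higher; the absolute delta still captures the change.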

        comparison["improvement"][metric] = {
            "absolute": round(diff, 4),
            "percent": round(pct, 2),
        }

        status = "UP" if diff > 0 else ("DOWN" if diff < 0 else "~")
        print(f" {metric:<20} A: {a_val:.4f} B: {b_val:.4f} [{status}] {pct:+.1f}%")

    # Verdict
    ragas_improvement = comparison["improvement"].get("ragas_score", {}).get("percent", 0)
    if ragas_improvement > 5:
        verdict = "ORPHAN CONNECTIONS IMPROVE QUALITY"
    elif ragas_improvement < -5:
        verdict = "ORPHAN CONNECTIONS DEGRADE QUALITY"
    else:
        verdict = "NO SIGNIFICANT DIFFERENCE"

    comparison["verdict"] = verdict
    print(f"\nVERDICT: {verdict}")

    # Save comparison
    comp_path = harness_a.results_dir / f"ab_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(comp_path, "w") as f:
        json.dump(comparison, f, indent=2)
    print(f"\nComparison saved: {comp_path}")

    return comparison


async def main():
    parser = argparse.ArgumentParser(
        description="E2E RAGAS Test Harness for LightRAG",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full E2E test
  python lightrag/evaluation/e2e_test_harness.py

  # A/B test (with/without orphan connections)
  python lightrag/evaluation/e2e_test_harness.py --ab-test

  # Skip paper download
  python lightrag/evaluation/e2e_test_harness.py --skip-download

  # Use existing dataset
  python lightrag/evaluation/e2e_test_harness.py --dataset arxiv_dataset.json
""",
    )

    parser.add_argument(
        "--rag-url",
        "-r",
        type=str,
        default=None,
        help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})",
    )

    parser.add_argument(
        "--papers",
        "-p",
        type=str,
        default=None,
        help="Comma-separated arXiv paper IDs",
    )

    parser.add_argument(
        "--questions",
        "-q",
        type=int,
        default=5,
        help="Questions per paper (default: 5)",
    )

    parser.add_argument(
        "--skip-download",
        action="store_true",
        help="Skip paper download (use existing)",
    )

    parser.add_argument(
        "--skip-ingest",
        action="store_true",
        help="Skip paper ingestion (use existing data)",
    )

    parser.add_argument(
        "--dataset",
        "-d",
        type=str,
        default=None,
        help="Path to existing Q&A dataset (skip generation)",
    )

    parser.add_argument(
        "--output-dir",
        "-o",
        type=str,
        default=None,
        help="Output directory for results",
    )

    parser.add_argument(
        "--ab-test",
        action="store_true",
        help="Run A/B test comparing with/without orphan connections",
    )

    args = parser.parse_args()

    # Parse paper IDs
    paper_ids = None
    if args.papers:
        paper_ids = [p.strip() for p in args.papers.split(",")]

    harness_config = {
        "rag_url": args.rag_url,
        "paper_ids": paper_ids,
        "questions_per_paper": args.questions,
        "skip_download": args.skip_download,
        "skip_ingest": args.skip_ingest,
        "dataset_path": args.dataset,
        "output_dir": args.output_dir,
    }

    if args.ab_test:
        await run_ab_test(harness_config)
    else:
        harness = E2ETestHarness(**harness_config)
        await harness.run_full_pipeline()


if __name__ == "__main__":
    asyncio.run(main())