chore(docker-compose, lightrag): optimize test infrastructure and add evaluation tools
Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning, Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality assessment. Introduce four new evaluation utilities: compare_results.py for A/B test analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py for automated evaluation pipelines, and ingest_test_docs.py for batch document ingestion. Update docker-compose.test.yml with aggressive async settings, memory limits, and optimized chunking parameters. Parallelize entity summarization in operate.py to speed up extraction. Fix typos in the merge node/edge log messages.
Parent: d2c9e6e2ec
Commit: ef7327bb3e
7 changed files with 1311 additions and 11 deletions
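Taken together, the new utilities support a gated evaluation loop. A minimal driver sketch (the file names baseline.json and experiment.json are illustrative; the flags come from the scripts in this commit):

import subprocess

# Run the full A/B pipeline (ingest twice, evaluate twice, compare).
subprocess.run(
    ["python", "lightrag/evaluation/e2e_test_harness.py", "--ab-test"],
    check=True,
)

# Or compare two previously saved RAGAS result files; compare_results.py
# exits non-zero on SIGNIFICANT_REGRESSION, so CI can gate on it.
subprocess.run(
    ["python", "lightrag/evaluation/compare_results.py", "baseline.json", "experiment.json"],
    check=True,
)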
docker-compose.test.yml
@@ -14,11 +14,35 @@ services:
       - "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres
     volumes:
       - pgdata_test:/var/lib/postgresql/data
+    command: |
+      postgres
+      -c shared_preload_libraries='vector,age'
+      -c max_connections=150
+      -c shared_buffers=768MB
+      -c work_mem=32MB
+      -c checkpoint_completion_target=0.9
+      -c effective_cache_size=2GB
+      -c maintenance_work_mem=192MB
+      -c wal_compression=on
+      -c checkpoint_timeout=10min
+      -c max_wal_size=1GB
+      -c random_page_cost=1.1
+      -c effective_io_concurrency=200
+      -c max_worker_processes=12
+      -c max_parallel_workers_per_gather=4
+      -c max_parallel_workers=8
+      -c max_parallel_maintenance_workers=4
+      -c jit_above_cost=50000
+      -c jit_inline_above_cost=250000
+      -c jit_optimize_above_cost=250000
+      -c default_statistics_target=200
+      -c hash_mem_multiplier=4
     healthcheck:
       test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
       interval: 5s
       timeout: 5s
       retries: 5
+    mem_limit: 2g

   lightrag:
     container_name: lightrag-test
@@ -67,8 +91,14 @@ services:
       - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
       - ENTITY_RESOLUTION_MAX_CANDIDATES=3

-      # Processing
-      - MAX_ASYNC=4
+      # Processing - Aggressive settings from agent-sdk
+      - MAX_ASYNC=96
+      - MAX_PARALLEL_INSERT=10
+      - EMBEDDING_FUNC_MAX_ASYNC=16
+      - EMBEDDING_BATCH_NUM=48
+
+      # Gunicorn - 8 workers x 4 threads = 32 concurrent handlers
+      - GUNICORN_CMD_ARGS=--workers=8 --worker-class=gthread --threads=4 --worker-connections=1000 --timeout=120 --keep-alive=5 --graceful-timeout=30

       # Extraction Optimization - Reduce Orphan Nodes
       - CHUNK_SIZE=800 # Smaller chunks for focused extraction
@@ -84,12 +114,23 @@ services:
     depends_on:
       postgres:
         condition: service_healthy
+    entrypoint: []
+    command:
+      - python
+      - /app/lightrag/api/run_with_gunicorn.py
+      - --workers
+      - "8"
+      - --llm-binding
+      - openai
+      - --embedding-binding
+      - openai
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
       interval: 10s
       timeout: 5s
       retries: 10
-      start_period: 30s
+      start_period: 60s
+    mem_limit: 2g

 volumes:
   pgdata_test:
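A quick way to confirm the tuned flags actually took effect inside the test container is to query the live server settings. A minimal sketch, assuming psycopg2 is installed; the port, user, and database come from the compose file above, while the password is environment-specific and assumed here:

import psycopg2  # any Postgres client works; psycopg2 is an assumption

conn = psycopg2.connect(
    host="localhost", port=5433,          # host port mapped in docker-compose.test.yml
    dbname="lightrag", user="lightrag",
    password="lightrag",                  # assumption -- substitute your test value
)
with conn.cursor() as cur:
    for setting in ("shared_buffers", "max_connections", "work_mem"):
        cur.execute(f"SHOW {setting}")
        print(setting, "=", cur.fetchone()[0])
conn.close()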
lightrag/evaluation/compare_results.py — Normal file, 322 lines
@@ -0,0 +1,322 @@
#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation

Compares two RAGAS evaluation result files to determine if a change
(e.g., orphan connections) improved or degraded retrieval quality.

Usage:
    python lightrag/evaluation/compare_results.py baseline.json experiment.json
    python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""

import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""
    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% change


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []

    baseline_avg = baseline.get("average_metrics", {})
    experiment_avg = experiment.get("average_metrics", {})

    metrics_to_compare = [
        ("faithfulness", "Faithfulness"),
        ("answer_relevance", "Answer Relevance"),
        ("context_recall", "Context Recall"),
        ("context_precision", "Context Precision"),
        ("ragas_score", "RAGAS Score"),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))

        abs_change = e_val - b_val
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0

        comparisons.append(MetricComparison(
            metric_name=metric_name,
            baseline_value=b_val,
            experiment_value=e_val,
            absolute_change=abs_change,
            relative_change_percent=rel_change,
            improved=abs_change > 0,
            significant=abs(rel_change) > 5,  # > 5% is significant
        ))

    return comparisons


def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get("benchmark_stats", {})
    experiment_stats = experiment.get("benchmark_stats", {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict
    ragas_comparison = next((c for c in comparisons if c.metric_name == "RAGAS Score"), None)

    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = "SIGNIFICANT_IMPROVEMENT"
            verdict_description = f"RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%"
        elif ragas_comparison.improved:
            verdict = "MINOR_IMPROVEMENT"
            verdict_description = f"RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%"
        elif ragas_comparison.significant:
            verdict = "SIGNIFICANT_REGRESSION"
            verdict_description = f"RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%"
        elif ragas_comparison.absolute_change == 0:
            verdict = "NO_CHANGE"
            verdict_description = "No measurable difference between runs"
        else:
            verdict = "MINOR_REGRESSION"
            verdict_description = f"RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%"
    else:
        verdict = "UNKNOWN"
        verdict_description = "Could not determine RAGAS score comparison"

    return {
        "analysis_timestamp": datetime.now().isoformat(),
        "baseline_file": str(baseline_path),
        "experiment_file": str(experiment_path),
        "verdict": verdict,
        "verdict_description": verdict_description,
        "summary": {
            "metrics_improved": improvements,
            "metrics_regressed": regressions,
            "significant_improvements": significant_improvements,
            "significant_regressions": significant_regressions,
        },
        "metrics": [
            {
                "name": c.metric_name,
                "baseline": round(c.baseline_value, 4),
                "experiment": round(c.experiment_value, 4),
                "change": round(c.absolute_change, 4),
                "change_percent": round(c.relative_change_percent, 2),
                "improved": c.improved,
                "significant": c.significant,
            }
            for c in comparisons
        ],
        "baseline_summary": {
            "total_tests": baseline_stats.get("total_tests", 0),
            "successful_tests": baseline_stats.get("successful_tests", 0),
            "success_rate": baseline_stats.get("success_rate", 0),
        },
        "experiment_summary": {
            "total_tests": experiment_stats.get("total_tests", 0),
            "successful_tests": experiment_stats.get("successful_tests", 0),
            "success_rate": experiment_stats.get("success_rate", 0),
        },
    }


def print_comparison_report(analysis: dict):
    """Print a formatted comparison report to stdout."""
    print("=" * 70)
    print("A/B TEST COMPARISON REPORT")
    print("=" * 70)
    print(f"Baseline:   {analysis['baseline_file']}")
    print(f"Experiment: {analysis['experiment_file']}")
    print("-" * 70)

    # Verdict
    verdict = analysis["verdict"]
    verdict_icon = {
        "SIGNIFICANT_IMPROVEMENT": "PASS",
        "MINOR_IMPROVEMENT": "PASS",
        "NO_CHANGE": "~",
        "MINOR_REGRESSION": "WARN",
        "SIGNIFICANT_REGRESSION": "FAIL",
        "UNKNOWN": "?",
    }.get(verdict, "?")

    print(f"\n[{verdict_icon}] VERDICT: {verdict}")
    print(f"    {analysis['verdict_description']}")

    # Metrics table
    print("\n" + "-" * 70)
    print(f"{'Metric':<20} {'Baseline':>10} {'Experiment':>10} {'Change':>10} {'Status':>10}")
    print("-" * 70)

    for metric in analysis["metrics"]:
        name = metric["name"]
        baseline = f"{metric['baseline']:.4f}"
        experiment = f"{metric['experiment']:.4f}"

        change = metric["change"]
        change_pct = metric["change_percent"]
        if change > 0:
            change_str = f"+{change:.4f}"
            status = f"+{change_pct:.1f}%"
        elif change < 0:
            change_str = f"{change:.4f}"
            status = f"{change_pct:.1f}%"
        else:
            change_str = "0.0000"
            status = "0.0%"

        if metric["significant"]:
            if metric["improved"]:
                status = f"[UP] {status}"
            else:
                status = f"[DOWN] {status}"
        else:
            status = f"  {status}"

        print(f"{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}")

    print("-" * 70)

    # Summary
    summary = analysis["summary"]
    print(f"\nSummary: {summary['metrics_improved']} improved, {summary['metrics_regressed']} regressed")
    print(f"         {summary['significant_improvements']} significant improvements, {summary['significant_regressions']} significant regressions")

    # Test counts
    b_summary = analysis["baseline_summary"]
    e_summary = analysis["experiment_summary"]
    print(f"\nBaseline:   {b_summary['successful_tests']}/{b_summary['total_tests']} tests ({b_summary['success_rate']:.1f}% success)")
    print(f"Experiment: {e_summary['successful_tests']}/{e_summary['total_tests']} tests ({e_summary['success_rate']:.1f}% success)")

    print("=" * 70)


def main():
    parser = argparse.ArgumentParser(
        description="Compare RAGAS evaluation results from two runs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Compare baseline vs experiment
    python lightrag/evaluation/compare_results.py baseline.json experiment.json

    # Save comparison to file
    python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json

    # Compare with/without orphan connections
    python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
        """,
    )

    parser.add_argument("baseline", type=str, help="Path to baseline results JSON file")
    parser.add_argument("experiment", type=str, help="Path to experiment results JSON file")
    parser.add_argument("--output", "-o", type=str, default=None, help="Output path for comparison JSON (optional)")

    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        print(f"Error: Baseline file not found: {baseline_path}")
        sys.exit(1)
    if not experiment_path.exists():
        print(f"Error: Experiment file not found: {experiment_path}")
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            json.dump(analysis, f, indent=2)
        print(f"\nComparison saved to: {output_path}")

    # Exit with status based on verdict
    if analysis["verdict"] in ("SIGNIFICANT_REGRESSION",):
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()
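As a worked example of the 5% significance threshold above: a RAGAS score moving from 0.70 to 0.74 is a +5.7% relative change, which compare_metrics flags as significant (the import path is assumed from this commit's layout):

from lightrag.evaluation.compare_results import compare_metrics  # path assumed

baseline = {"average_metrics": {"ragas_score": 0.70}}
experiment = {"average_metrics": {"ragas_score": 0.74}}

for c in compare_metrics(baseline, experiment):
    if c.metric_name == "RAGAS Score":
        # 0.04 / 0.70 * 100 = +5.7%, above the 5% threshold
        print(f"{c.relative_change_percent:+.1f}% significant={c.significant}")
        # -> +5.7% significant=True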
lightrag/evaluation/download_wikipedia.py — Normal file, 183 lines
@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
Download Wikipedia articles for LightRAG ingestion testing.

This script fetches plain text from Wikipedia articles across diverse domains
to create a test dataset with intentional entity overlap for testing:
- Entity merging and summarization
- Cross-domain relationships
- Parallel processing optimizations

Usage:
    python lightrag/evaluation/download_wikipedia.py
    python lightrag/evaluation/download_wikipedia.py --output wiki_docs/
    python lightrag/evaluation/download_wikipedia.py --domains medical,climate
"""

import argparse
import asyncio
from pathlib import Path

import httpx

# Wikipedia API endpoint (no auth required)
WIKI_API = "https://en.wikipedia.org/w/api.php"

# User-Agent required by Wikipedia API policy
# See: https://meta.wikimedia.org/wiki/User-Agent_policy
USER_AGENT = "LightRAG-Test-Downloader/1.0 (https://github.com/HKUDS/LightRAG; claude@example.com)"

# Article selection by domain - chosen for entity overlap
# WHO → Medical + Climate
# Carbon/Emissions → Climate + Finance (ESG)
# Germany/Brazil → Sports + general knowledge
ARTICLES = {
    "medical": ["Diabetes", "COVID-19"],
    "finance": ["Stock_market", "Cryptocurrency"],
    "climate": ["Climate_change", "Renewable_energy"],
    "sports": ["FIFA_World_Cup", "Olympic_Games"],
}


async def fetch_article(title: str, client: httpx.AsyncClient) -> dict | None:
    """Fetch Wikipedia article text via API.

    Args:
        title: Wikipedia article title (use underscores for spaces)
        client: Async HTTP client

    Returns:
        Dict with title, content, and source; or None if not found
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,  # Plain text, no HTML
        "format": "json",
    }
    response = await client.get(WIKI_API, params=params)

    # Check for HTTP errors
    if response.status_code != 200:
        print(f"  HTTP {response.status_code} for {title}")
        return None

    # Handle empty response
    if not response.content:
        print(f"  Empty response for {title}")
        return None

    try:
        data = response.json()
    except Exception as e:
        print(f"  JSON parse error for {title}: {e}")
        return None

    pages = data.get("query", {}).get("pages", {})

    for page_id, page in pages.items():
        if page_id != "-1":  # -1 = not found
            return {
                "title": page.get("title", title),
                "content": page.get("extract", ""),
                "source": f"wikipedia_{title}",
            }
    return None


async def download_articles(
    domains: list[str],
    output_dir: Path,
) -> list[dict]:
    """Download all articles for selected domains.

    Args:
        domains: List of domain names (e.g., ["medical", "climate"])
        output_dir: Directory to save downloaded articles

    Returns:
        List of article metadata dicts
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    articles = []

    headers = {"User-Agent": USER_AGENT}
    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        for domain in domains:
            titles = ARTICLES.get(domain, [])
            if not titles:
                print(f"[{domain.upper()}] Unknown domain, skipping")
                continue

            print(f"[{domain.upper()}] Downloading {len(titles)} articles...")

            for title in titles:
                article = await fetch_article(title, client)
                if article:
                    # Save to file
                    filename = f"{domain}_{title.lower().replace(' ', '_')}.txt"
                    filepath = output_dir / filename
                    filepath.write_text(article["content"])

                    word_count = len(article["content"].split())
                    print(f"  ✓ {title}: {word_count:,} words")

                    articles.append(
                        {
                            "domain": domain,
                            "title": article["title"],
                            "file": str(filepath),
                            "words": word_count,
                            "source": article["source"],
                        }
                    )
                else:
                    print(f"  ✗ {title}: Not found")

    return articles


async def main():
    parser = argparse.ArgumentParser(description="Download Wikipedia test articles")
    parser.add_argument("--output", "-o", type=str, default="lightrag/evaluation/wiki_documents", help="Output directory for downloaded articles")
    parser.add_argument("--domains", "-d", type=str, default="medical,finance,climate,sports", help="Comma-separated domains to download")
    args = parser.parse_args()

    domains = [d.strip() for d in args.domains.split(",")]
    output_dir = Path(args.output)

    print("=== Wikipedia Article Downloader ===")
    print(f"Domains: {', '.join(domains)}")
    print(f"Output: {output_dir}/")
    print()

    articles = await download_articles(domains, output_dir)

    total_words = sum(a["words"] for a in articles)
    print()
    print(f"✓ Downloaded {len(articles)} articles ({total_words:,} words total)")
    print(f"  Output: {output_dir}/")

    # Print summary by domain
    print("\nBy domain:")
    for domain in domains:
        domain_articles = [a for a in articles if a["domain"] == domain]
        domain_words = sum(a["words"] for a in domain_articles)
        print(f"  {domain}: {len(domain_articles)} articles, {domain_words:,} words")


if __name__ == "__main__":
    asyncio.run(main())
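For a quick spot check of fetch_article without writing any files, something like the following should work (the import path is assumed from this commit's layout):

import asyncio

import httpx

from lightrag.evaluation.download_wikipedia import USER_AGENT, fetch_article  # path assumed

async def peek():
    # Same client setup as download_articles: 30s timeout, policy-compliant UA.
    async with httpx.AsyncClient(timeout=30.0, headers={"User-Agent": USER_AGENT}) as client:
        article = await fetch_article("Diabetes", client)
        if article:
            print(article["title"], "-", len(article["content"].split()), "words")

asyncio.run(peek())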
lightrag/evaluation/e2e_test_harness.py — Normal file, 531 lines
@@ -0,0 +1,531 @@
#!/usr/bin/env python3
"""
E2E RAGAS Test Harness for LightRAG

Complete end-to-end testing pipeline:
1. Download arXiv papers (reproducible test data)
2. Clear existing data (optional)
3. Ingest papers into LightRAG
4. Wait for processing
5. Generate Q&A dataset
6. Run RAGAS evaluation
7. Optional: A/B comparison

Usage:
    # Full E2E test
    python lightrag/evaluation/e2e_test_harness.py

    # A/B comparison (with/without orphan connections)
    python lightrag/evaluation/e2e_test_harness.py --ab-test

    # Skip download if papers exist
    python lightrag/evaluation/e2e_test_harness.py --skip-download

    # Use existing dataset
    python lightrag/evaluation/e2e_test_harness.py --dataset existing_dataset.json
"""

import argparse
import asyncio
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any

import httpx
from dotenv import load_dotenv

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

# Load environment variables
load_dotenv(dotenv_path=".env", override=False)

# Configuration
DEFAULT_RAG_URL = "http://localhost:9622"
DEFAULT_PAPERS = ["2312.10997", "2404.10981", "2005.11401"]
POLL_INTERVAL_SECONDS = 10
MAX_WAIT_SECONDS = 600  # 10 minutes max wait for processing


class E2ETestHarness:
    """End-to-end test harness for LightRAG RAGAS evaluation."""

    def __init__(
        self,
        rag_url: str = None,
        paper_ids: list[str] = None,
        questions_per_paper: int = 5,
        skip_download: bool = False,
        skip_ingest: bool = False,
        dataset_path: str = None,
        output_dir: str = None,
    ):
        self.rag_url = (rag_url or os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)).rstrip("/")
        self.paper_ids = paper_ids or DEFAULT_PAPERS
        self.questions_per_paper = questions_per_paper
        self.skip_download = skip_download
        self.skip_ingest = skip_ingest
        self.dataset_path = Path(dataset_path) if dataset_path else None

        # Determine directories
        self.eval_dir = Path(__file__).parent
        self.papers_dir = self.eval_dir / "papers"
        self.results_dir = Path(output_dir) if output_dir else self.eval_dir / "results"
        self.results_dir.mkdir(parents=True, exist_ok=True)

        # API key for LightRAG
        self.api_key = os.getenv("LIGHTRAG_API_KEY")

    async def check_lightrag_health(self) -> bool:
        """Check if LightRAG API is accessible."""
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.get(f"{self.rag_url}/health")
                response.raise_for_status()
                print(f"[OK] LightRAG API accessible at {self.rag_url}")
                return True
        except Exception as e:
            print(f"[ERROR] Cannot connect to LightRAG API: {e}")
            return False

    async def download_papers(self) -> list[str]:
        """Download arXiv papers."""
        if self.skip_download:
            print("[SKIP] Paper download (--skip-download)")
            # Check existing papers
            existing = [
                str(self.papers_dir / f"{pid}.pdf")
                for pid in self.paper_ids
                if (self.papers_dir / f"{pid}.pdf").exists()
            ]
            print(f"[INFO] Found {len(existing)} existing papers")
            return existing

        print("\n" + "=" * 60)
        print("STEP 1: Download arXiv Papers")
        print("=" * 60)

        from lightrag.evaluation.download_arxiv import download_papers

        results = await download_papers(self.paper_ids, self.papers_dir)
        return [r["path"] for r in results if r["status"] in ("downloaded", "exists")]

    async def clear_existing_data(self) -> bool:
        """Clear existing documents in LightRAG (optional)."""
        print("\n[INFO] Clearing existing data...")
        try:
            headers = {"X-API-Key": self.api_key} if self.api_key else {}
            async with httpx.AsyncClient(timeout=60.0) as client:
                # Get current documents
                response = await client.get(
                    f"{self.rag_url}/documents",
                    headers=headers,
                )
                response.raise_for_status()
                docs = response.json()

                # Clear all documents
                statuses = docs.get("statuses", {})
                all_docs = []
                for status_docs in statuses.values():
                    all_docs.extend(status_docs)

                if all_docs:
                    print(f"[INFO] Clearing {len(all_docs)} existing documents...")
                    for doc in all_docs:
                        doc_id = doc.get("id")
                        if doc_id:
                            await client.delete(
                                f"{self.rag_url}/documents/{doc_id}",
                                headers=headers,
                            )
                    print("[OK] Cleared existing documents")
                else:
                    print("[OK] No existing documents to clear")

                return True
        except Exception as e:
            print(f"[WARN] Could not clear data: {e}")
            return False

    async def ingest_papers(self, paper_paths: list[str]) -> bool:
        """Ingest papers into LightRAG."""
        if self.skip_ingest:
            print("[SKIP] Paper ingestion (--skip-ingest)")
            return True

        print("\n" + "=" * 60)
        print("STEP 2: Ingest Papers into LightRAG")
        print("=" * 60)

        headers = {"X-API-Key": self.api_key} if self.api_key else {}

        async with httpx.AsyncClient(timeout=300.0) as client:
            for paper_path in paper_paths:
                path = Path(paper_path)
                if not path.exists():
                    print(f"[WARN] Paper not found: {paper_path}")
                    continue

                print(f"[UPLOAD] {path.name}")

                try:
                    with open(path, "rb") as f:
                        files = {"file": (path.name, f, "application/pdf")}
                        response = await client.post(
                            f"{self.rag_url}/documents/upload",
                            files=files,
                            headers=headers,
                        )
                    response.raise_for_status()
                    result = response.json()
                    print(f"  [OK] Uploaded: {result}")
                except Exception as e:
                    print(f"  [ERROR] Upload failed: {e}")

        return True

    async def wait_for_processing(self) -> bool:
        """Wait for all documents to finish processing."""
        print("\n" + "=" * 60)
        print("STEP 3: Wait for Document Processing")
        print("=" * 60)

        headers = {"X-API-Key": self.api_key} if self.api_key else {}
        start_time = time.time()

        async with httpx.AsyncClient(timeout=30.0) as client:
            while time.time() - start_time < MAX_WAIT_SECONDS:
                try:
                    response = await client.get(
                        f"{self.rag_url}/documents",
                        headers=headers,
                    )
                    response.raise_for_status()
                    docs = response.json()

                    statuses = docs.get("statuses", {})
                    # API returns lowercase status keys
                    processing = len(statuses.get("processing", []))
                    pending = len(statuses.get("pending", []))
                    completed = len(statuses.get("processed", []))  # Note: "processed" not "completed"
                    failed = len(statuses.get("failed", []))

                    elapsed = int(time.time() - start_time)
                    print(f"  [{elapsed}s] Processing: {processing}, Pending: {pending}, Completed: {completed}, Failed: {failed}")

                    if processing == 0 and pending == 0:
                        print("[OK] All documents processed")
                        return True

                except Exception as e:
                    print(f"  [WARN] Status check failed: {e}")

                await asyncio.sleep(POLL_INTERVAL_SECONDS)

        print("[ERROR] Timeout waiting for document processing")
        return False

    async def generate_dataset(self) -> Path:
        """Generate Q&A dataset from ingested papers."""
        if self.dataset_path and self.dataset_path.exists():
            print(f"[SKIP] Using existing dataset: {self.dataset_path}")
            return self.dataset_path

        print("\n" + "=" * 60)
        print("STEP 4: Generate Q&A Dataset")
        print("=" * 60)

        from lightrag.evaluation.generate_arxiv_dataset import generate_dataset

        output_path = self.eval_dir / f"arxiv_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        await generate_dataset(
            paper_ids=self.paper_ids,
            questions_per_paper=self.questions_per_paper,
            rag_url=self.rag_url,
            output_path=output_path,
        )

        return output_path

    async def run_ragas_evaluation(self, dataset_path: Path) -> dict:
        """Run RAGAS evaluation."""
        print("\n" + "=" * 60)
        print("STEP 5: Run RAGAS Evaluation")
        print("=" * 60)

        from lightrag.evaluation.eval_rag_quality import RAGEvaluator

        evaluator = RAGEvaluator(
            test_dataset_path=str(dataset_path),
            rag_api_url=self.rag_url,
        )

        results = await evaluator.run()
        return results

    async def run_full_pipeline(self) -> dict:
        """Run the complete E2E test pipeline."""
        print("=" * 70)
        print("E2E RAGAS TEST HARNESS FOR LIGHTRAG")
        print("=" * 70)
        print(f"RAG URL: {self.rag_url}")
        print(f"Papers: {', '.join(self.paper_ids)}")
        print(f"Questions: {self.questions_per_paper} per paper")
        print(f"Results: {self.results_dir}")
        print("=" * 70)

        start_time = time.time()

        # Check LightRAG is accessible
        if not await self.check_lightrag_health():
            return {"error": "LightRAG API not accessible"}

        # Step 1: Download papers
        paper_paths = await self.download_papers()
        if not paper_paths:
            return {"error": "No papers to process"}

        # Step 2: Ingest papers
        if not await self.ingest_papers(paper_paths):
            return {"error": "Paper ingestion failed"}

        # Step 3: Wait for processing
        if not self.skip_ingest:
            if not await self.wait_for_processing():
                return {"error": "Document processing timeout"}

        # Step 4: Generate dataset
        dataset_path = await self.generate_dataset()

        # Step 5: Run RAGAS evaluation
        results = await self.run_ragas_evaluation(dataset_path)

        elapsed_time = time.time() - start_time

        # Save summary
        summary = {
            "pipeline_completed_at": datetime.now().isoformat(),
            "total_elapsed_seconds": round(elapsed_time, 2),
            "papers": self.paper_ids,
            "dataset_path": str(dataset_path),
            "ragas_results": results.get("benchmark_stats", {}),
        }

        summary_path = self.results_dir / f"e2e_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2)

        print("\n" + "=" * 70)
        print("E2E PIPELINE COMPLETE")
        print("=" * 70)
        print(f"Total time: {elapsed_time:.1f} seconds")
        print(f"Summary saved: {summary_path}")
        print("=" * 70)

        return summary


async def run_ab_test(
    harness_config: dict,
    clear_between_runs: bool = True,
) -> dict:
    """
    Run A/B test comparing with/without orphan connections.

    Args:
        harness_config: Configuration for E2ETestHarness
        clear_between_runs: Clear data between A and B runs

    Returns:
        A/B comparison results
    """
    print("=" * 70)
    print("A/B TEST: WITH vs WITHOUT ORPHAN CONNECTIONS")
    print("=" * 70)

    results = {}

    # Test A: WITHOUT orphan connections
    print("\n[A] Running WITHOUT orphan connections...")
    os.environ["AUTO_CONNECT_ORPHANS"] = "false"

    harness_a = E2ETestHarness(**harness_config)
    results["without_orphans"] = await harness_a.run_full_pipeline()

    # Clear for next run
    if clear_between_runs:
        await harness_a.clear_existing_data()

    # Test B: WITH orphan connections
    print("\n[B] Running WITH orphan connections...")
    os.environ["AUTO_CONNECT_ORPHANS"] = "true"

    # Force re-ingest for test B
    harness_config_b = harness_config.copy()
    harness_config_b["skip_download"] = True  # Papers already downloaded
    harness_config_b["skip_ingest"] = False  # Need to re-ingest

    harness_b = E2ETestHarness(**harness_config_b)
    results["with_orphans"] = await harness_b.run_full_pipeline()

    # Compare results
    print("\n" + "=" * 70)
    print("A/B COMPARISON")
    print("=" * 70)

    a_stats = results["without_orphans"].get("ragas_results", {}).get("average_metrics", {})
    b_stats = results["with_orphans"].get("ragas_results", {}).get("average_metrics", {})

    comparison = {
        "timestamp": datetime.now().isoformat(),
        "without_orphans": a_stats,
        "with_orphans": b_stats,
        "improvement": {},
    }

    for metric in ["faithfulness", "answer_relevance", "context_recall", "context_precision", "ragas_score"]:
        a_val = a_stats.get(metric, 0)
        b_val = b_stats.get(metric, 0)
        diff = b_val - a_val
        pct = (diff / a_val * 100) if a_val > 0 else 0

        comparison["improvement"][metric] = {
            "absolute": round(diff, 4),
            "percent": round(pct, 2),
        }

        status = "UP" if diff > 0 else ("DOWN" if diff < 0 else "~")
        print(f"  {metric:<20} A: {a_val:.4f}  B: {b_val:.4f}  [{status}] {pct:+.1f}%")

    # Verdict
    ragas_improvement = comparison["improvement"].get("ragas_score", {}).get("percent", 0)
    if ragas_improvement > 5:
        verdict = "ORPHAN CONNECTIONS IMPROVE QUALITY"
    elif ragas_improvement < -5:
        verdict = "ORPHAN CONNECTIONS DEGRADE QUALITY"
    else:
        verdict = "NO SIGNIFICANT DIFFERENCE"

    comparison["verdict"] = verdict
    print(f"\nVERDICT: {verdict}")

    # Save comparison
    comp_path = harness_a.results_dir / f"ab_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(comp_path, "w") as f:
        json.dump(comparison, f, indent=2)
    print(f"\nComparison saved: {comp_path}")

    return comparison


async def main():
    parser = argparse.ArgumentParser(
        description="E2E RAGAS Test Harness for LightRAG",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Full E2E test
    python lightrag/evaluation/e2e_test_harness.py

    # A/B test (with/without orphan connections)
    python lightrag/evaluation/e2e_test_harness.py --ab-test

    # Skip paper download
    python lightrag/evaluation/e2e_test_harness.py --skip-download

    # Use existing dataset
    python lightrag/evaluation/e2e_test_harness.py --dataset arxiv_dataset.json
        """,
    )

    parser.add_argument("--rag-url", "-r", type=str, default=None, help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})")
    parser.add_argument("--papers", "-p", type=str, default=None, help="Comma-separated arXiv paper IDs")
    parser.add_argument("--questions", "-q", type=int, default=5, help="Questions per paper (default: 5)")
    parser.add_argument("--skip-download", action="store_true", help="Skip paper download (use existing)")
    parser.add_argument("--skip-ingest", action="store_true", help="Skip paper ingestion (use existing data)")
    parser.add_argument("--dataset", "-d", type=str, default=None, help="Path to existing Q&A dataset (skip generation)")
    parser.add_argument("--output-dir", "-o", type=str, default=None, help="Output directory for results")
    parser.add_argument("--ab-test", action="store_true", help="Run A/B test comparing with/without orphan connections")

    args = parser.parse_args()

    # Parse paper IDs
    paper_ids = None
    if args.papers:
        paper_ids = [p.strip() for p in args.papers.split(",")]

    harness_config = {
        "rag_url": args.rag_url,
        "paper_ids": paper_ids,
        "questions_per_paper": args.questions,
        "skip_download": args.skip_download,
        "skip_ingest": args.skip_ingest,
        "dataset_path": args.dataset,
        "output_dir": args.output_dir,
    }

    if args.ab_test:
        await run_ab_test(harness_config)
    else:
        harness = E2ETestHarness(**harness_config)
        await harness.run_full_pipeline()


if __name__ == "__main__":
    asyncio.run(main())
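The harness can also be driven programmatically rather than via the CLI; a minimal sketch reusing the constructor arguments above (the paper ID and counts are illustrative, and skip_download assumes the PDF is already in the papers directory):

import asyncio

from lightrag.evaluation.e2e_test_harness import E2ETestHarness  # path assumed

async def smoke():
    harness = E2ETestHarness(
        rag_url="http://localhost:9622",  # default from the module
        paper_ids=["2312.10997"],         # one paper keeps the run short
        questions_per_paper=2,
        skip_download=True,               # reuse an already-downloaded PDF
    )
    summary = await harness.run_full_pipeline()
    print(summary.get("ragas_results", {}))

asyncio.run(smoke())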
lightrag/evaluation/ingest_test_docs.py — Normal file, 170 lines
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Ingest test documents into LightRAG for testing.

This script reads text files from a directory and batch-uploads them to
LightRAG via the /documents/texts API endpoint, then polls for completion.

Usage:
    python lightrag/evaluation/ingest_test_docs.py
    python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
"""

import argparse
import asyncio
import os
import time
from pathlib import Path

import httpx

DEFAULT_RAG_URL = "http://localhost:9622"


async def ingest_documents(
    input_dir: Path,
    rag_url: str,
) -> dict:
    """Ingest all text files from directory into LightRAG.

    Args:
        input_dir: Directory containing .txt or .md files
        rag_url: LightRAG API base URL

    Returns:
        Dict with ingestion statistics
    """
    timeout = httpx.Timeout(120.0, connect=30.0)
    api_key = os.getenv("LIGHTRAG_API_KEY")
    headers = {"X-API-Key": api_key} if api_key else {}

    async with httpx.AsyncClient(timeout=timeout) as client:
        # Check health
        try:
            health = await client.get(f"{rag_url}/health")
            if health.status_code != 200:
                raise ConnectionError(f"LightRAG not healthy: {health.status_code}")
        except httpx.ConnectError:
            raise ConnectionError(f"Cannot connect to LightRAG at {rag_url}")

        print(f"✓ Connected to LightRAG at {rag_url}")

        # Collect all text files
        files = list(input_dir.glob("*.txt")) + list(input_dir.glob("*.md"))
        if not files:
            print(f"✗ No .txt or .md files found in {input_dir}")
            return {"documents": 0, "elapsed_seconds": 0}

        print(f"  Found {len(files)} documents to ingest")

        # Read all texts
        texts = []
        sources = []
        for file in sorted(files):
            content = file.read_text()
            texts.append(content)
            sources.append(file.name)
            word_count = len(content.split())
            print(f"  {file.name}: {word_count:,} words")

        # Batch ingest via /documents/texts
        print(f"\n  Uploading {len(texts)} documents...")
        start = time.time()

        response = await client.post(
            f"{rag_url}/documents/texts",
            json={"texts": texts, "file_sources": sources},
            headers=headers,
        )
        response.raise_for_status()
        result = response.json()

        track_id = result.get("track_id", "")
        print(f"  Track ID: {track_id}")

        # Poll for completion - wait for processing to start first
        print("  Waiting for processing to start...")
        await asyncio.sleep(2)  # Give server time to queue documents

        last_status = ""
        processed_count = 0
        expected_total = len(texts)
        initial_check = True

        while True:
            status_response = await client.get(f"{rag_url}/documents")
            docs = status_response.json()
            statuses = docs.get("statuses", {})

            processing = len(statuses.get("processing", []))
            pending = len(statuses.get("pending", []))
            processed = len(statuses.get("processed", []))
            total_visible = processing + pending + processed

            current_status = f"Pending: {pending}, Processing: {processing}, Processed: {processed}"
            if current_status != last_status:
                print(f"  {current_status}")
                last_status = current_status
            processed_count = processed

            # Wait until we see at least some of our docs in the queue
            if initial_check and (pending > 0 or processing > 0):
                initial_check = False
                print("  Processing started!")

            # Only exit when processing is done AND we've processed something new
            if processing == 0 and pending == 0 and not initial_check:
                break

            await asyncio.sleep(5)

        elapsed = time.time() - start
        print(f"\n✓ Ingestion complete in {elapsed:.1f}s")
        print(f"  Documents processed: {processed_count}")
        print(f"  Average: {elapsed / len(texts):.1f}s per document")

        return {
            "documents": len(texts),
            "processed": processed_count,
            "elapsed_seconds": elapsed,
            "track_id": track_id,
        }


async def main():
    parser = argparse.ArgumentParser(description="Ingest test documents into LightRAG")
    parser.add_argument("--input", "-i", type=str, default="lightrag/evaluation/wiki_documents", help="Input directory with text files")
    parser.add_argument("--rag-url", "-r", type=str, default=None, help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})")
    args = parser.parse_args()

    input_dir = Path(args.input)
    rag_url = args.rag_url or os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)

    print("=== LightRAG Document Ingestion ===")
    print(f"Input: {input_dir}/")
    print(f"RAG URL: {rag_url}")
    print()

    if not input_dir.exists():
        print(f"✗ Input directory not found: {input_dir}")
        print("  Run download_wikipedia.py first:")
        print("  python lightrag/evaluation/download_wikipedia.py")
        return

    await ingest_documents(input_dir, rag_url)


if __name__ == "__main__":
    asyncio.run(main())
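The batch upload the script performs reduces to a single POST against /documents/texts; a minimal synchronous sketch of the same request shape (the URL and sample text are illustrative):

import httpx

# Payload shape taken from ingest_documents above: parallel lists of
# document bodies and their source names.
resp = httpx.post(
    "http://localhost:9622/documents/texts",
    json={"texts": ["LightRAG test document."], "file_sources": ["inline.txt"]},
    timeout=120.0,
)
resp.raise_for_status()
print("track_id:", resp.json().get("track_id", ""))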
lightrag/evaluation/populate_test_data.sh — Executable file, 43 lines
@@ -0,0 +1,43 @@
#!/bin/bash
# Quick script to populate LightRAG with diverse test documents
#
# This downloads Wikipedia articles across 4 domains (Medical, Finance, Climate, Sports)
# and ingests them into LightRAG. The articles are chosen to have entity overlap
# (WHO, Carbon/Emissions, Organizations) to test entity merging and summarization.
#
# Usage:
#   ./lightrag/evaluation/populate_test_data.sh
#   LIGHTRAG_API_URL=http://localhost:9622 ./lightrag/evaluation/populate_test_data.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
RAG_URL="${LIGHTRAG_API_URL:-http://localhost:9622}"

echo "=== LightRAG Test Data Population ==="
echo "RAG URL: $RAG_URL"
echo ""

# Check if LightRAG is running
if ! curl -s "$RAG_URL/health" > /dev/null 2>&1; then
    echo "✗ Cannot connect to LightRAG at $RAG_URL"
    echo "  Make sure LightRAG is running first"
    exit 1
fi

# 1. Download Wikipedia articles
echo "[1/2] Downloading Wikipedia articles..."
python3 "$SCRIPT_DIR/download_wikipedia.py"

# 2. Ingest into LightRAG
echo ""
echo "[2/2] Ingesting documents..."
python3 "$SCRIPT_DIR/ingest_test_docs.py" --rag-url "$RAG_URL"

echo ""
echo "=== Done! ==="
echo "Documents ingested into LightRAG."
echo ""
echo "Next steps:"
echo "  - Check graph stats: curl $RAG_URL/graph/statistics"
echo "  - Query the data: curl '$RAG_URL/query?mode=global&query=What+is+climate+change'"
operate.py
@@ -280,12 +280,12 @@ async def _handle_entity_relation_summary(
         f"  Summarizing {entity_or_relation_name}: Map {len(current_list)} descriptions into {len(chunks)} groups"
     )

-    # Reduce phase: summarize each group from chunks
-    new_summaries = []
-    for chunk in chunks:
+    # Reduce phase: summarize each group from chunks IN PARALLEL
+    async def _summarize_single_chunk(chunk: list[str]) -> tuple[str, bool]:
+        """Summarize a single chunk, returning (summary, used_llm)."""
         if len(chunk) == 1:
             # Optimization: single description chunks don't need LLM summarization
-            new_summaries.append(chunk[0])
+            return chunk[0], False
         else:
             # Multiple descriptions need LLM summarization
             summary = await _summarize_descriptions(
@@ -295,8 +295,18 @@ async def _handle_entity_relation_summary(
                 global_config,
                 llm_response_cache,
             )
-            new_summaries.append(summary)
-            llm_was_used = True  # Mark that LLM was used in reduce phase
+            return summary, True
+
+    # Create tasks for all chunks and run in parallel
+    tasks = [
+        asyncio.create_task(_summarize_single_chunk(chunk)) for chunk in chunks
+    ]
+    results = await asyncio.gather(*tasks)
+
+    # Collect results while preserving order
+    new_summaries = [result[0] for result in results]
+    if any(result[1] for result in results):
+        llm_was_used = True  # Mark that LLM was used in reduce phase

     # Update current list with new summaries for next iteration
     current_list = new_summaries
@@ -2115,7 +2125,7 @@ async def _merge_nodes_then_upsert(
     deduplicated_num = already_fragment + len(nodes_data) - num_fragment
     dd_message = ""
     if deduplicated_num > 0:
-        # Duplicated description detected across multiple trucks for the same entity
+        # Duplicated description detected across multiple chunks for the same entity
         dd_message = f"dd {deduplicated_num}"

     if dd_message or truncation_info_log:
@@ -2459,7 +2469,7 @@ async def _merge_edges_then_upsert(
     deduplicated_num = already_fragment + len(edges_data) - num_fragment
     dd_message = ""
     if deduplicated_num > 0:
-        # Duplicated description detected across multiple trucks for the same entity
+        # Duplicated description detected across multiple chunks for the same entity
         dd_message = f"dd {deduplicated_num}"

     if dd_message or truncation_info_log:
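The hunk above is the standard asyncio fan-out/fan-in pattern: asyncio.gather preserves input order, so the reduced summaries stay aligned with their chunks even though the awaits complete in any order. A self-contained sketch of the same idea (names here are illustrative, not from operate.py):

import asyncio

async def summarize(chunk: list[str]) -> tuple[str, bool]:
    # Stand-in for the LLM call; single-item chunks skip it entirely.
    if len(chunk) == 1:
        return chunk[0], False
    await asyncio.sleep(0.1)  # simulated LLM latency
    return " / ".join(chunk), True

async def reduce_parallel(chunks: list[list[str]]) -> list[str]:
    # gather() returns results in the order the coroutines were passed in,
    # so summaries line up with chunks regardless of completion order.
    results = await asyncio.gather(*(summarize(c) for c in chunks))
    if any(used for _, used in results):
        print("LLM was used in reduce phase")
    return [s for s, _ in results]

print(asyncio.run(reduce_parallel([["a"], ["b", "c"], ["d", "e", "f"]])))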