Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning, Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality assessment. Introduce four new evaluation utilities: compare_results.py for A/B test analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py for automated evaluation pipelines, and ingest_test_docs.py for batch document ingestion. Update docker-compose.test.yml with aggressive async settings, memory limits, and optimized chunking parameters. Parallelize entity summarization in operate.py for improved extraction performance. Fix typos in the merge node/edge log messages.
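For reference, compare_results.py reads only a small slice of each RAGAS result file. Below is a minimal sketch of the expected shape, inferred from the script's benchmark_stats and average_metrics lookups; the numbers are placeholders, not real evaluation output:

# Minimal result-file shape consumed by compare_results.py; field names mirror
# the script's .get() lookups, and the values are placeholders only.
example_result = {
    "benchmark_stats": {
        "total_tests": 50,
        "successful_tests": 48,
        "success_rate": 96.0,
        "average_metrics": {
            "faithfulness": 0.91,
            "answer_relevance": 0.87,
            "context_recall": 0.82,
            "context_precision": 0.79,
            "ragas_score": 0.85,
        },
    },
}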
#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation

Compares two RAGAS evaluation result files to determine if a change
(e.g., orphan connections) improved or degraded retrieval quality.

Usage:
    python lightrag/evaluation/compare_results.py baseline.json experiment.json
    python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""

import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""

    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% change


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []

    baseline_avg = baseline.get("average_metrics", {})
    experiment_avg = experiment.get("average_metrics", {})

    metrics_to_compare = [
        ("faithfulness", "Faithfulness"),
        ("answer_relevance", "Answer Relevance"),
        ("context_recall", "Context Recall"),
        ("context_precision", "Context Precision"),
        ("ragas_score", "RAGAS Score"),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))

        abs_change = e_val - b_val
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0

        comparisons.append(MetricComparison(
            metric_name=metric_name,
            baseline_value=b_val,
            experiment_value=e_val,
            absolute_change=abs_change,
            relative_change_percent=rel_change,
            improved=abs_change > 0,
            significant=abs(rel_change) > 5,  # > 5% is significant
        ))

    return comparisons


def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get("benchmark_stats", {})
    experiment_stats = experiment.get("benchmark_stats", {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict
    ragas_comparison = next((c for c in comparisons if c.metric_name == "RAGAS Score"), None)

    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = "SIGNIFICANT_IMPROVEMENT"
            verdict_description = f"RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%"
        elif ragas_comparison.improved:
            verdict = "MINOR_IMPROVEMENT"
            verdict_description = f"RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%"
        elif ragas_comparison.significant:
            verdict = "SIGNIFICANT_REGRESSION"
            verdict_description = f"RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%"
        elif ragas_comparison.absolute_change == 0:
            verdict = "NO_CHANGE"
            verdict_description = "No measurable difference between runs"
        else:
            verdict = "MINOR_REGRESSION"
            verdict_description = f"RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%"
    else:
        verdict = "UNKNOWN"
        verdict_description = "Could not determine RAGAS score comparison"

    return {
        "analysis_timestamp": datetime.now().isoformat(),
        "baseline_file": str(baseline_path),
        "experiment_file": str(experiment_path),
        "verdict": verdict,
        "verdict_description": verdict_description,
        "summary": {
            "metrics_improved": improvements,
            "metrics_regressed": regressions,
            "significant_improvements": significant_improvements,
            "significant_regressions": significant_regressions,
        },
        "metrics": [
            {
                "name": c.metric_name,
                "baseline": round(c.baseline_value, 4),
                "experiment": round(c.experiment_value, 4),
                "change": round(c.absolute_change, 4),
                "change_percent": round(c.relative_change_percent, 2),
                "improved": c.improved,
                "significant": c.significant,
            }
            for c in comparisons
        ],
        "baseline_summary": {
            "total_tests": baseline_stats.get("total_tests", 0),
            "successful_tests": baseline_stats.get("successful_tests", 0),
            "success_rate": baseline_stats.get("success_rate", 0),
        },
        "experiment_summary": {
            "total_tests": experiment_stats.get("total_tests", 0),
            "successful_tests": experiment_stats.get("successful_tests", 0),
            "success_rate": experiment_stats.get("success_rate", 0),
        },
    }


def print_comparison_report(analysis: dict):
    """Print a formatted comparison report to stdout."""
    print("=" * 70)
    print("A/B TEST COMPARISON REPORT")
    print("=" * 70)
    print(f"Baseline: {analysis['baseline_file']}")
    print(f"Experiment: {analysis['experiment_file']}")
    print("-" * 70)

    # Verdict
    verdict = analysis["verdict"]
    verdict_icon = {
        "SIGNIFICANT_IMPROVEMENT": "PASS",
        "MINOR_IMPROVEMENT": "PASS",
        "NO_CHANGE": "~",
        "MINOR_REGRESSION": "WARN",
        "SIGNIFICANT_REGRESSION": "FAIL",
        "UNKNOWN": "?",
    }.get(verdict, "?")

    print(f"\n[{verdict_icon}] VERDICT: {verdict}")
    print(f" {analysis['verdict_description']}")

    # Metrics table
    print("\n" + "-" * 70)
    print(f"{'Metric':<20} {'Baseline':>10} {'Experiment':>10} {'Change':>10} {'Status':>10}")
    print("-" * 70)

    for metric in analysis["metrics"]:
        name = metric["name"]
        baseline = f"{metric['baseline']:.4f}"
        experiment = f"{metric['experiment']:.4f}"

        change = metric["change"]
        change_pct = metric["change_percent"]
        if change > 0:
            change_str = f"+{change:.4f}"
            status = f"+{change_pct:.1f}%"
        elif change < 0:
            change_str = f"{change:.4f}"
            status = f"{change_pct:.1f}%"
        else:
            change_str = "0.0000"
            status = "0.0%"

        if metric["significant"]:
            if metric["improved"]:
                status = f"[UP] {status}"
            else:
                status = f"[DOWN] {status}"
        else:
            status = f" {status}"

        print(f"{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}")

    print("-" * 70)

    # Summary
    summary = analysis["summary"]
    print(f"\nSummary: {summary['metrics_improved']} improved, {summary['metrics_regressed']} regressed")
    print(f" {summary['significant_improvements']} significant improvements, {summary['significant_regressions']} significant regressions")

    # Test counts
    b_summary = analysis["baseline_summary"]
    e_summary = analysis["experiment_summary"]
    print(f"\nBaseline: {b_summary['successful_tests']}/{b_summary['total_tests']} tests ({b_summary['success_rate']:.1f}% success)")
    print(f"Experiment: {e_summary['successful_tests']}/{e_summary['total_tests']} tests ({e_summary['success_rate']:.1f}% success)")

    print("=" * 70)


def main():
    parser = argparse.ArgumentParser(
        description="Compare RAGAS evaluation results from two runs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare baseline vs experiment
  python lightrag/evaluation/compare_results.py baseline.json experiment.json

  # Save comparison to file
  python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json

  # Compare with/without orphan connections
  python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
""",
    )

    parser.add_argument(
        "baseline",
        type=str,
        help="Path to baseline results JSON file",
    )

    parser.add_argument(
        "experiment",
        type=str,
        help="Path to experiment results JSON file",
    )

    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        help="Output path for comparison JSON (optional)",
    )

    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        print(f"Error: Baseline file not found: {baseline_path}")
        sys.exit(1)
    if not experiment_path.exists():
        print(f"Error: Experiment file not found: {experiment_path}")
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            json.dump(analysis, f, indent=2)
        print(f"\nComparison saved to: {output_path}")

    # Exit with status based on verdict
    if analysis["verdict"] in ("SIGNIFICANT_REGRESSION",):
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()
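The comparator can also be driven programmatically, for example as a regression gate in CI. A short sketch, assuming lightrag.evaluation is importable as a package and that baseline.json and experiment.json were produced by the evaluation harness:

from pathlib import Path

# Assumption: the evaluation utilities are importable as a package; otherwise
# invoke the CLI entry point shown in the module docstring instead.
from lightrag.evaluation.compare_results import analyze_results, print_comparison_report

analysis = analyze_results(Path("baseline.json"), Path("experiment.json"))
print_comparison_report(analysis)

# Mirror the CLI exit behaviour: fail only on a significant regression.
if analysis["verdict"] == "SIGNIFICANT_REGRESSION":
    raise SystemExit(1)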