#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation

Compares two RAGAS evaluation result files to determine if a change
(e.g., orphan connections) improved or degraded retrieval quality.

Usage:
    python lightrag/evaluation/compare_results.py baseline.json experiment.json
    python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""

import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""

    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% change


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling None and NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []

    baseline_avg = baseline.get('average_metrics', {})
    experiment_avg = experiment.get('average_metrics', {})

    metrics_to_compare = [
        ('faithfulness', 'Faithfulness'),
        ('answer_relevance', 'Answer Relevance'),
        ('context_recall', 'Context Recall'),
        ('context_precision', 'Context Precision'),
        ('ragas_score', 'RAGAS Score'),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))

        abs_change = e_val - b_val
        # Guard against division by zero: a zero baseline reports 0% relative
        # change, so such a metric can never be flagged as significant.
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0

        comparisons.append(
            MetricComparison(
                metric_name=metric_name,
                baseline_value=b_val,
                experiment_value=e_val,
                absolute_change=abs_change,
                relative_change_percent=rel_change,
                improved=abs_change > 0,
                significant=abs(rel_change) > 5,  # > 5% is significant
            )
        )

    return comparisons
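
# Worked example (hypothetical numbers, not taken from any real run): with a
# baseline ragas_score of 0.80 and an experiment value of 0.86, compare_metrics
# yields absolute_change = 0.06 and relative_change_percent = 7.5, which clears
# the 5% threshold, so that comparison is flagged improved and significant.
#
#   compare_metrics(
#       {'average_metrics': {'ragas_score': 0.80}},
#       {'average_metrics': {'ragas_score': 0.86}},
#   )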

def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get('benchmark_stats', {})
    experiment_stats = experiment.get('benchmark_stats', {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict from the RAGAS Score comparison
    ragas_comparison = next((c for c in comparisons if c.metric_name == 'RAGAS Score'), None)
    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = 'SIGNIFICANT_IMPROVEMENT'
            verdict_description = f'RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.improved:
            verdict = 'MINOR_IMPROVEMENT'
            verdict_description = f'RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.significant:
            verdict = 'SIGNIFICANT_REGRESSION'
            verdict_description = f'RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
        elif ragas_comparison.absolute_change == 0:
            verdict = 'NO_CHANGE'
            verdict_description = 'No measurable difference between runs'
        else:
            verdict = 'MINOR_REGRESSION'
            verdict_description = (
                f'RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
            )
    else:
        verdict = 'UNKNOWN'
        verdict_description = 'Could not determine RAGAS score comparison'

    return {
        'analysis_timestamp': datetime.now().isoformat(),
        'baseline_file': str(baseline_path),
        'experiment_file': str(experiment_path),
        'verdict': verdict,
        'verdict_description': verdict_description,
        'summary': {
            'metrics_improved': improvements,
            'metrics_regressed': regressions,
            'significant_improvements': significant_improvements,
            'significant_regressions': significant_regressions,
        },
        'metrics': [
            {
                'name': c.metric_name,
                'baseline': round(c.baseline_value, 4),
                'experiment': round(c.experiment_value, 4),
                'change': round(c.absolute_change, 4),
                'change_percent': round(c.relative_change_percent, 2),
                'improved': c.improved,
                'significant': c.significant,
            }
            for c in comparisons
        ],
        'baseline_summary': {
            'total_tests': baseline_stats.get('total_tests', 0),
            'successful_tests': baseline_stats.get('successful_tests', 0),
            'success_rate': baseline_stats.get('success_rate', 0),
        },
        'experiment_summary': {
            'total_tests': experiment_stats.get('total_tests', 0),
            'successful_tests': experiment_stats.get('successful_tests', 0),
            'success_rate': experiment_stats.get('success_rate', 0),
        },
    }
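
# Expected input shape (an assumption inferred from the keys read above; the
# exact schema is produced by the RAGAS evaluation runner): each results file
# is JSON along these lines.
#
#   {
#       "benchmark_stats": {
#           "average_metrics": {
#               "faithfulness": 0.91,
#               "answer_relevance": 0.88,
#               "context_recall": 0.84,
#               "context_precision": 0.86,
#               "ragas_score": 0.87
#           },
#           "total_tests": 50,
#           "successful_tests": 48,
#           "success_rate": 96.0
#       }
#   }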

def print_comparison_report(analysis: dict) -> None:
    """Print a formatted comparison report to stdout."""
    print('=' * 70)
    print('A/B TEST COMPARISON REPORT')
    print('=' * 70)
    print(f'Baseline:   {analysis["baseline_file"]}')
    print(f'Experiment: {analysis["experiment_file"]}')
    print('-' * 70)

    # Verdict
    verdict = analysis['verdict']
    verdict_icon = {
        'SIGNIFICANT_IMPROVEMENT': 'PASS',
        'MINOR_IMPROVEMENT': 'PASS',
        'NO_CHANGE': '~',
        'MINOR_REGRESSION': 'WARN',
        'SIGNIFICANT_REGRESSION': 'FAIL',
        'UNKNOWN': '?',
    }.get(verdict, '?')

    print(f'\n[{verdict_icon}] VERDICT: {verdict}')
    print(f'    {analysis["verdict_description"]}')

    # Metrics table
    print('\n' + '-' * 70)
    print(f'{"Metric":<20} {"Baseline":>10} {"Experiment":>10} {"Change":>10} {"Status":>10}')
    print('-' * 70)

    for metric in analysis['metrics']:
        name = metric['name']
        baseline = f'{metric["baseline"]:.4f}'
        experiment = f'{metric["experiment"]:.4f}'
        change = metric['change']
        change_pct = metric['change_percent']

        if change > 0:
            change_str = f'+{change:.4f}'
            status = f'+{change_pct:.1f}%'
        elif change < 0:
            change_str = f'{change:.4f}'
            status = f'{change_pct:.1f}%'
        else:
            change_str = '0.0000'
            status = '0.0%'

        if metric['significant']:
            status = f'[UP] {status}' if metric['improved'] else f'[DOWN] {status}'
        else:
            status = f'     {status}'

        print(f'{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}')

    print('-' * 70)

    # Summary
    summary = analysis['summary']
    print(f'\nSummary: {summary["metrics_improved"]} improved, {summary["metrics_regressed"]} regressed')
    print(
        f'         {summary["significant_improvements"]} significant improvements, '
        f'{summary["significant_regressions"]} significant regressions'
    )

    # Test counts
    b_summary = analysis['baseline_summary']
    e_summary = analysis['experiment_summary']
    print(
        f'\nBaseline:   {b_summary["successful_tests"]}/{b_summary["total_tests"]} tests '
        f'({b_summary["success_rate"]:.1f}% success)'
    )
    print(
        f'Experiment: {e_summary["successful_tests"]}/{e_summary["total_tests"]} tests '
        f'({e_summary["success_rate"]:.1f}% success)'
    )
    print('=' * 70)


def main():
    parser = argparse.ArgumentParser(
        description='Compare RAGAS evaluation results from two runs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare baseline vs experiment
  python lightrag/evaluation/compare_results.py baseline.json experiment.json

  # Save comparison to file
  python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json

  # Compare with/without orphan connections
  python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
        """,
    )
    parser.add_argument(
        'baseline',
        type=str,
        help='Path to baseline results JSON file',
    )
    parser.add_argument(
        'experiment',
        type=str,
        help='Path to experiment results JSON file',
    )
    parser.add_argument(
        '--output',
        '-o',
        type=str,
        default=None,
        help='Output path for comparison JSON (optional)',
    )

    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        print(f'Error: Baseline file not found: {baseline_path}', file=sys.stderr)
        sys.exit(1)
    if not experiment_path.exists():
        print(f'Error: Experiment file not found: {experiment_path}', file=sys.stderr)
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w') as f:
            json.dump(analysis, f, indent=2)
        print(f'\nComparison saved to: {output_path}')

    # Exit non-zero only on a significant regression
    if analysis['verdict'] == 'SIGNIFICANT_REGRESSION':
        sys.exit(1)
    sys.exit(0)


if __name__ == '__main__':
    main()
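
# CI gating sketch (hypothetical pipeline step, not part of this script): the
# exit code above lets a build fail automatically on a significant regression.
#
#   python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json
#   # exits 1 on SIGNIFICANT_REGRESSION, which fails the CI step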