#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation

Compares two RAGAS evaluation result files to determine if a change
(e.g., orphan connections) improved or degraded retrieval quality.

Usage:
    python lightrag/evaluation/compare_results.py baseline.json experiment.json
    python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""

import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

from lightrag.utils import logger


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""

    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% change


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []

    baseline_avg = baseline.get('average_metrics', {})
    experiment_avg = experiment.get('average_metrics', {})

    metrics_to_compare = [
        ('faithfulness', 'Faithfulness'),
        ('answer_relevance', 'Answer Relevance'),
        ('context_recall', 'Context Recall'),
        ('context_precision', 'Context Precision'),
        ('ragas_score', 'RAGAS Score'),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))

        abs_change = e_val - b_val
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0

        comparisons.append(
            MetricComparison(
                metric_name=metric_name,
                baseline_value=b_val,
                experiment_value=e_val,
                absolute_change=abs_change,
                relative_change_percent=rel_change,
                improved=abs_change > 0,
                significant=abs(rel_change) > 5,  # > 5% is significant
            )
        )

    return comparisons
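
# Illustrative sketch (an assumption, not a schema definition): each input file is
# expected to contain a 'benchmark_stats' section whose 'average_metrics' holds the
# per-metric averages read above, plus the test counts read by analyze_results() below:
#
#     {
#         "benchmark_stats": {
#             "total_tests": 20,
#             "successful_tests": 19,
#             "success_rate": 95.0,
#             "average_metrics": {
#                 "faithfulness": 0.91,
#                 "answer_relevance": 0.88,
#                 "context_recall": 0.84,
#                 "context_precision": 0.80,
#                 "ragas_score": 0.86
#             }
#         }
#     }
#
# The numbers are made up; only the key names mirror what this script reads.
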
def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get('benchmark_stats', {})
    experiment_stats = experiment.get('benchmark_stats', {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict
    ragas_comparison = next((c for c in comparisons if c.metric_name == 'RAGAS Score'), None)
    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = 'SIGNIFICANT_IMPROVEMENT'
            verdict_description = f'RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.improved:
            verdict = 'MINOR_IMPROVEMENT'
            verdict_description = f'RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.significant:
            verdict = 'SIGNIFICANT_REGRESSION'
            verdict_description = f'RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
        elif ragas_comparison.absolute_change == 0:
            verdict = 'NO_CHANGE'
            verdict_description = 'No measurable difference between runs'
        else:
            verdict = 'MINOR_REGRESSION'
            verdict_description = (
                f'RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
            )
    else:
        verdict = 'UNKNOWN'
        verdict_description = 'Could not determine RAGAS score comparison'

    return {
        'analysis_timestamp': datetime.now().isoformat(),
        'baseline_file': str(baseline_path),
        'experiment_file': str(experiment_path),
        'verdict': verdict,
        'verdict_description': verdict_description,
        'summary': {
            'metrics_improved': improvements,
            'metrics_regressed': regressions,
            'significant_improvements': significant_improvements,
            'significant_regressions': significant_regressions,
        },
        'metrics': [
            {
                'name': c.metric_name,
                'baseline': round(c.baseline_value, 4),
                'experiment': round(c.experiment_value, 4),
                'change': round(c.absolute_change, 4),
                'change_percent': round(c.relative_change_percent, 2),
                'improved': c.improved,
                'significant': c.significant,
            }
            for c in comparisons
        ],
        'baseline_summary': {
            'total_tests': baseline_stats.get('total_tests', 0),
            'successful_tests': baseline_stats.get('successful_tests', 0),
            'success_rate': baseline_stats.get('success_rate', 0),
        },
        'experiment_summary': {
            'total_tests': experiment_stats.get('total_tests', 0),
            'successful_tests': experiment_stats.get('successful_tests', 0),
            'success_rate': experiment_stats.get('success_rate', 0),
        },
    }
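
# Illustrative example (hypothetical file names): analyze_results() can also be
# called programmatically, e.g. from a notebook or a test:
#
#     analysis = analyze_results(Path('baseline.json'), Path('experiment.json'))
#     analysis['verdict']                      # e.g. 'MINOR_IMPROVEMENT'
#     analysis['metrics'][0]['name']           # 'Faithfulness'
#     analysis['summary']['metrics_improved']  # count of metrics that went up
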
def print_comparison_report(analysis: dict):
    """Print a formatted comparison report to stdout."""
    print('=' * 70)
    print('A/B TEST COMPARISON REPORT')
    print('=' * 70)
    print(f'Baseline: {analysis["baseline_file"]}')
    print(f'Experiment: {analysis["experiment_file"]}')
    print('-' * 70)

    # Verdict
    verdict = analysis['verdict']
    verdict_icon = {
        'SIGNIFICANT_IMPROVEMENT': 'PASS',
        'MINOR_IMPROVEMENT': 'PASS',
        'NO_CHANGE': '~',
        'MINOR_REGRESSION': 'WARN',
        'SIGNIFICANT_REGRESSION': 'FAIL',
        'UNKNOWN': '?',
    }.get(verdict, '?')

    print(f'\n[{verdict_icon}] VERDICT: {verdict}')
    print(f' {analysis["verdict_description"]}')

    # Metrics table
    print('\n' + '-' * 70)
    print(f'{"Metric":<20} {"Baseline":>10} {"Experiment":>10} {"Change":>10} {"Status":>10}')
    print('-' * 70)

    for metric in analysis['metrics']:
        name = metric['name']
        baseline = f'{metric["baseline"]:.4f}'
        experiment = f'{metric["experiment"]:.4f}'
        change = metric['change']
        change_pct = metric['change_percent']

        if change > 0:
            change_str = f'+{change:.4f}'
            status = f'+{change_pct:.1f}%'
        elif change < 0:
            change_str = f'{change:.4f}'
            status = f'{change_pct:.1f}%'
        else:
            change_str = '0.0000'
            status = '0.0%'

        if metric['significant']:
            status = f'[UP] {status}' if metric['improved'] else f'[DOWN] {status}'
        else:
            status = f' {status}'

        print(f'{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}')

    print('-' * 70)

    # Summary
    summary = analysis['summary']
    print(f'\nSummary: {summary["metrics_improved"]} improved, {summary["metrics_regressed"]} regressed')
    print(
        f' {summary["significant_improvements"]} significant improvements, {summary["significant_regressions"]} significant regressions'
    )

    # Test counts
    b_summary = analysis['baseline_summary']
    e_summary = analysis['experiment_summary']
    print(
        f'\nBaseline: {b_summary["successful_tests"]}/{b_summary["total_tests"]} tests ({b_summary["success_rate"]:.1f}% success)'
    )
    print(
        f'Experiment: {e_summary["successful_tests"]}/{e_summary["total_tests"]} tests ({e_summary["success_rate"]:.1f}% success)'
    )
    print('=' * 70)


def main():
    parser = argparse.ArgumentParser(
        description='Compare RAGAS evaluation results from two runs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare baseline vs experiment
  python lightrag/evaluation/compare_results.py baseline.json experiment.json

  # Save comparison to file
  python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json

  # Compare with/without orphan connections
  python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
""",
    )
    parser.add_argument(
        'baseline',
        type=str,
        help='Path to baseline results JSON file',
    )
    parser.add_argument(
        'experiment',
        type=str,
        help='Path to experiment results JSON file',
    )
    parser.add_argument(
        '--output',
        '-o',
        type=str,
        default=None,
        help='Output path for comparison JSON (optional)',
    )

    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        logger.error('Error: Baseline file not found: %s', baseline_path)
        sys.exit(1)
    if not experiment_path.exists():
        logger.error('Error: Experiment file not found: %s', experiment_path)
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w') as f:
            json.dump(analysis, f, indent=2)
        logger.info('Comparison saved to: %s', output_path)

    # Exit with status based on verdict
    if analysis['verdict'] in ('SIGNIFICANT_REGRESSION',):
        sys.exit(1)
    sys.exit(0)


if __name__ == '__main__':
    main()
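
# Illustrative CI usage (an assumption, not part of this module): the script exits
# non-zero only on SIGNIFICANT_REGRESSION, so a pipeline step can gate on it.
# File names below are hypothetical placeholders.
#
#     python lightrag/evaluation/compare_results.py baseline.json experiment.json \
#         --output comparison.json || echo "RAGAS retrieval quality regressed"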