LightRAG/lightrag/evaluation/compare_results.py
#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation

Compares two RAGAS evaluation result files to determine whether a change
(e.g., adding orphan connections) improved or degraded retrieval quality.

Usage:
    python lightrag/evaluation/compare_results.py baseline.json experiment.json
    python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""
import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""

    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% relative change


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default
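
# A few illustrative calls (not part of the module's public API; shown for clarity):
#   safe_float("0.87")        -> 0.87
#   safe_float(float("nan"))  -> 0.0   (NaN falls back to the default)
#   safe_float(None, 1.0)     -> 1.0   (None uses the supplied default)
#   safe_float("n/a")         -> 0.0   (ValueError is caught and defaulted)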


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []
    baseline_avg = baseline.get('average_metrics', {})
    experiment_avg = experiment.get('average_metrics', {})

    metrics_to_compare = [
        ('faithfulness', 'Faithfulness'),
        ('answer_relevance', 'Answer Relevance'),
        ('context_recall', 'Context Recall'),
        ('context_precision', 'Context Precision'),
        ('ragas_score', 'RAGAS Score'),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))
        abs_change = e_val - b_val
        # Guard against division by zero: when the baseline is 0 the relative
        # change is reported as 0%, so such a metric is never flagged significant.
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0
        comparisons.append(
            MetricComparison(
                metric_name=metric_name,
                baseline_value=b_val,
                experiment_value=e_val,
                absolute_change=abs_change,
                relative_change_percent=rel_change,
                improved=abs_change > 0,
                significant=abs(rel_change) > 5,  # > 5% is significant
            )
        )
    return comparisons
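
# Worked example (illustrative numbers): a faithfulness move from 0.80 to 0.86 gives
# absolute_change = 0.06 and relative_change_percent ~= 7.5, so improved is True and
# significant is True because |7.5| > 5.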


def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get('benchmark_stats', {})
    experiment_stats = experiment.get('benchmark_stats', {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict
    ragas_comparison = next((c for c in comparisons if c.metric_name == 'RAGAS Score'), None)
    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = 'SIGNIFICANT_IMPROVEMENT'
            verdict_description = f'RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.improved:
            verdict = 'MINOR_IMPROVEMENT'
            verdict_description = f'RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.significant:
            verdict = 'SIGNIFICANT_REGRESSION'
            verdict_description = f'RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
        elif ragas_comparison.absolute_change == 0:
            verdict = 'NO_CHANGE'
            verdict_description = 'No measurable difference between runs'
        else:
            verdict = 'MINOR_REGRESSION'
            verdict_description = (
                f'RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
            )
    else:
        verdict = 'UNKNOWN'
        verdict_description = 'Could not determine RAGAS score comparison'

    return {
        'analysis_timestamp': datetime.now().isoformat(),
        'baseline_file': str(baseline_path),
        'experiment_file': str(experiment_path),
        'verdict': verdict,
        'verdict_description': verdict_description,
        'summary': {
            'metrics_improved': improvements,
            'metrics_regressed': regressions,
            'significant_improvements': significant_improvements,
            'significant_regressions': significant_regressions,
        },
        'metrics': [
            {
                'name': c.metric_name,
                'baseline': round(c.baseline_value, 4),
                'experiment': round(c.experiment_value, 4),
                'change': round(c.absolute_change, 4),
                'change_percent': round(c.relative_change_percent, 2),
                'improved': c.improved,
                'significant': c.significant,
            }
            for c in comparisons
        ],
        'baseline_summary': {
            'total_tests': baseline_stats.get('total_tests', 0),
            'successful_tests': baseline_stats.get('successful_tests', 0),
            'success_rate': baseline_stats.get('success_rate', 0),
        },
        'experiment_summary': {
            'total_tests': experiment_stats.get('total_tests', 0),
            'successful_tests': experiment_stats.get('successful_tests', 0),
            'success_rate': experiment_stats.get('success_rate', 0),
        },
    }
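
# Verdict mapping, restated from the branches above (all based on the RAGAS Score row):
#   improved and |relative change| > 5%   -> SIGNIFICANT_IMPROVEMENT
#   improved and |relative change| <= 5%  -> MINOR_IMPROVEMENT
#   regressed and |relative change| > 5%  -> SIGNIFICANT_REGRESSION
#   absolute change == 0                  -> NO_CHANGE
#   otherwise (small regression)          -> MINOR_REGRESSION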


def print_comparison_report(analysis: dict) -> None:
    """Print a formatted comparison report to stdout."""
    print('=' * 70)
    print('A/B TEST COMPARISON REPORT')
    print('=' * 70)
    print(f'Baseline: {analysis["baseline_file"]}')
    print(f'Experiment: {analysis["experiment_file"]}')
    print('-' * 70)

    # Verdict
    verdict = analysis['verdict']
    verdict_icon = {
        'SIGNIFICANT_IMPROVEMENT': 'PASS',
        'MINOR_IMPROVEMENT': 'PASS',
        'NO_CHANGE': '~',
        'MINOR_REGRESSION': 'WARN',
        'SIGNIFICANT_REGRESSION': 'FAIL',
        'UNKNOWN': '?',
    }.get(verdict, '?')
    print(f'\n[{verdict_icon}] VERDICT: {verdict}')
    print(f' {analysis["verdict_description"]}')

    # Metrics table
    print('\n' + '-' * 70)
    print(f'{"Metric":<20} {"Baseline":>10} {"Experiment":>10} {"Change":>10} {"Status":>10}')
    print('-' * 70)
    for metric in analysis['metrics']:
        name = metric['name']
        baseline = f'{metric["baseline"]:.4f}'
        experiment = f'{metric["experiment"]:.4f}'
        change = metric['change']
        change_pct = metric['change_percent']
        if change > 0:
            change_str = f'+{change:.4f}'
            status = f'+{change_pct:.1f}%'
        elif change < 0:
            change_str = f'{change:.4f}'
            status = f'{change_pct:.1f}%'
        else:
            change_str = '0.0000'
            status = '0.0%'
        if metric['significant']:
            status = f'[UP] {status}' if metric['improved'] else f'[DOWN] {status}'
        else:
            status = f' {status}'
        print(f'{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}')
    print('-' * 70)

    # Summary
    summary = analysis['summary']
    print(f'\nSummary: {summary["metrics_improved"]} improved, {summary["metrics_regressed"]} regressed')
    print(
        f' {summary["significant_improvements"]} significant improvements, {summary["significant_regressions"]} significant regressions'
    )

    # Test counts
    b_summary = analysis['baseline_summary']
    e_summary = analysis['experiment_summary']
    print(
        f'\nBaseline: {b_summary["successful_tests"]}/{b_summary["total_tests"]} tests ({b_summary["success_rate"]:.1f}% success)'
    )
    print(
        f'Experiment: {e_summary["successful_tests"]}/{e_summary["total_tests"]} tests ({e_summary["success_rate"]:.1f}% success)'
    )
    print('=' * 70)


def main() -> None:
    parser = argparse.ArgumentParser(
        description='Compare RAGAS evaluation results from two runs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare baseline vs experiment
  python lightrag/evaluation/compare_results.py baseline.json experiment.json
  # Save comparison to file
  python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json
  # Compare with/without orphan connections
  python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
""",
    )
    parser.add_argument(
        'baseline',
        type=str,
        help='Path to baseline results JSON file',
    )
    parser.add_argument(
        'experiment',
        type=str,
        help='Path to experiment results JSON file',
    )
    parser.add_argument(
        '--output',
        '-o',
        type=str,
        default=None,
        help='Output path for comparison JSON (optional)',
    )
    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        print(f'Error: Baseline file not found: {baseline_path}')
        sys.exit(1)
    if not experiment_path.exists():
        print(f'Error: Experiment file not found: {experiment_path}')
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w') as f:
            json.dump(analysis, f, indent=2)
        print(f'\nComparison saved to: {output_path}')

    # Exit with status based on verdict
    if analysis['verdict'] in ('SIGNIFICANT_REGRESSION',):
        sys.exit(1)
    sys.exit(0)


if __name__ == '__main__':
    main()
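
# Shell sketch for CI gating (paths are placeholders): the exit status is non-zero
# for SIGNIFICANT_REGRESSION and for missing input files, so a pipeline step like
#   python lightrag/evaluation/compare_results.py baseline.json experiment.json -o comparison.json
# will fail the build when retrieval quality drops significantly.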