LightRAG/lightrag/evaluation/compare_results.py

#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation
Compares two RAGAS evaluation result files to determine if a change
(e.g., orphan connections) improved or degraded retrieval quality.
Usage:
python lightrag/evaluation/compare_results.py baseline.json experiment.json
python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""
import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

from lightrag.utils import logger
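
# Expected shape of each results file, inferred from the accessors used in this
# module (other fields are ignored and missing values fall back to 0):
#
#     {
#         "benchmark_stats": {
#             "average_metrics": {
#                 "faithfulness": ..., "answer_relevance": ..., "context_recall": ...,
#                 "context_precision": ..., "ragas_score": ...
#             },
#             "total_tests": ..., "successful_tests": ..., "success_rate": ...
#         }
#     }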


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""

    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% change
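
# Illustrative example (hypothetical values): a faithfulness score moving from
# 0.80 (baseline) to 0.86 (experiment) yields absolute_change=0.06,
# relative_change_percent=7.5, improved=True and significant=True (7.5% > 5%).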


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default
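
# For example, safe_float(float('nan')) and safe_float(None) both return the
# default of 0.0, while safe_float('0.93') returns 0.93.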


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []
    baseline_avg = baseline.get('average_metrics', {})
    experiment_avg = experiment.get('average_metrics', {})

    metrics_to_compare = [
        ('faithfulness', 'Faithfulness'),
        ('answer_relevance', 'Answer Relevance'),
        ('context_recall', 'Context Recall'),
        ('context_precision', 'Context Precision'),
        ('ragas_score', 'RAGAS Score'),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))

        abs_change = e_val - b_val
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0

        comparisons.append(
            MetricComparison(
                metric_name=metric_name,
                baseline_value=b_val,
                experiment_value=e_val,
                absolute_change=abs_change,
                relative_change_percent=rel_change,
                improved=abs_change > 0,
                significant=abs(rel_change) > 5,  # > 5% is significant
            )
        )

    return comparisons
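
# Illustrative example (hypothetical values): a ragas_score of 0.80 (baseline)
# vs 0.83 (experiment) is reported as +0.03 (+3.75%), an improvement that is
# not marked significant because it does not exceed the 5% threshold.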


def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get('benchmark_stats', {})
    experiment_stats = experiment.get('benchmark_stats', {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict
    ragas_comparison = next((c for c in comparisons if c.metric_name == 'RAGAS Score'), None)
    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = 'SIGNIFICANT_IMPROVEMENT'
            verdict_description = f'RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.improved:
            verdict = 'MINOR_IMPROVEMENT'
            verdict_description = f'RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.significant:
            verdict = 'SIGNIFICANT_REGRESSION'
            verdict_description = f'RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
        elif ragas_comparison.absolute_change == 0:
            verdict = 'NO_CHANGE'
            verdict_description = 'No measurable difference between runs'
        else:
            verdict = 'MINOR_REGRESSION'
            verdict_description = (
                f'RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
            )
    else:
        verdict = 'UNKNOWN'
        verdict_description = 'Could not determine RAGAS score comparison'

    return {
        'analysis_timestamp': datetime.now().isoformat(),
        'baseline_file': str(baseline_path),
        'experiment_file': str(experiment_path),
        'verdict': verdict,
        'verdict_description': verdict_description,
        'summary': {
            'metrics_improved': improvements,
            'metrics_regressed': regressions,
            'significant_improvements': significant_improvements,
            'significant_regressions': significant_regressions,
        },
        'metrics': [
            {
                'name': c.metric_name,
                'baseline': round(c.baseline_value, 4),
                'experiment': round(c.experiment_value, 4),
                'change': round(c.absolute_change, 4),
                'change_percent': round(c.relative_change_percent, 2),
                'improved': c.improved,
                'significant': c.significant,
            }
            for c in comparisons
        ],
        'baseline_summary': {
            'total_tests': baseline_stats.get('total_tests', 0),
            'successful_tests': baseline_stats.get('successful_tests', 0),
            'success_rate': baseline_stats.get('success_rate', 0),
        },
        'experiment_summary': {
            'total_tests': experiment_stats.get('total_tests', 0),
            'successful_tests': experiment_stats.get('successful_tests', 0),
            'success_rate': experiment_stats.get('success_rate', 0),
        },
    }
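
# Sketch of how this is used (hypothetical file names; see main() below):
#
#     analysis = analyze_results(Path('baseline.json'), Path('experiment.json'))
#     analysis['verdict']   # e.g. 'MINOR_IMPROVEMENT'
#     analysis['summary']   # counts of improved / regressed metrics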


def print_comparison_report(analysis: dict):
    """Print a formatted comparison report to stdout."""
    print('=' * 70)
    print('A/B TEST COMPARISON REPORT')
    print('=' * 70)
    print(f'Baseline: {analysis["baseline_file"]}')
    print(f'Experiment: {analysis["experiment_file"]}')
    print('-' * 70)

    # Verdict
    verdict = analysis['verdict']
    verdict_icon = {
        'SIGNIFICANT_IMPROVEMENT': 'PASS',
        'MINOR_IMPROVEMENT': 'PASS',
        'NO_CHANGE': '~',
        'MINOR_REGRESSION': 'WARN',
        'SIGNIFICANT_REGRESSION': 'FAIL',
        'UNKNOWN': '?',
    }.get(verdict, '?')

    print(f'\n[{verdict_icon}] VERDICT: {verdict}')
    print(f' {analysis["verdict_description"]}')

    # Metrics table
    print('\n' + '-' * 70)
    print(f'{"Metric":<20} {"Baseline":>10} {"Experiment":>10} {"Change":>10} {"Status":>10}')
    print('-' * 70)

    for metric in analysis['metrics']:
        name = metric['name']
        baseline = f'{metric["baseline"]:.4f}'
        experiment = f'{metric["experiment"]:.4f}'
        change = metric['change']
        change_pct = metric['change_percent']

        if change > 0:
            change_str = f'+{change:.4f}'
            status = f'+{change_pct:.1f}%'
        elif change < 0:
            change_str = f'{change:.4f}'
            status = f'{change_pct:.1f}%'
        else:
            change_str = '0.0000'
            status = '0.0%'

        if metric['significant']:
            status = f'[UP] {status}' if metric['improved'] else f'[DOWN] {status}'
        else:
            status = f' {status}'

        print(f'{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}')

    print('-' * 70)

    # Summary
    summary = analysis['summary']
    print(f'\nSummary: {summary["metrics_improved"]} improved, {summary["metrics_regressed"]} regressed')
    print(
        f' {summary["significant_improvements"]} significant improvements, {summary["significant_regressions"]} significant regressions'
    )

    # Test counts
    b_summary = analysis['baseline_summary']
    e_summary = analysis['experiment_summary']
    print(
        f'\nBaseline: {b_summary["successful_tests"]}/{b_summary["total_tests"]} tests ({b_summary["success_rate"]:.1f}% success)'
    )
    print(
        f'Experiment: {e_summary["successful_tests"]}/{e_summary["total_tests"]} tests ({e_summary["success_rate"]:.1f}% success)'
    )
    print('=' * 70)


def main():
    parser = argparse.ArgumentParser(
        description='Compare RAGAS evaluation results from two runs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Compare baseline vs experiment
python lightrag/evaluation/compare_results.py baseline.json experiment.json

# Save comparison to file
python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json

# Compare with/without orphan connections
python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
""",
    )
    parser.add_argument(
        'baseline',
        type=str,
        help='Path to baseline results JSON file',
    )
    parser.add_argument(
        'experiment',
        type=str,
        help='Path to experiment results JSON file',
    )
    parser.add_argument(
        '--output',
        '-o',
        type=str,
        default=None,
        help='Output path for comparison JSON (optional)',
    )
    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        logger.error('Error: Baseline file not found: %s', baseline_path)
        sys.exit(1)
    if not experiment_path.exists():
        logger.error('Error: Experiment file not found: %s', experiment_path)
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w') as f:
            json.dump(analysis, f, indent=2)
        logger.info('Comparison saved to: %s', output_path)

    # Exit with status based on verdict
    if analysis['verdict'] in ('SIGNIFICANT_REGRESSION',):
        sys.exit(1)
    sys.exit(0)
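
# Note: main() exits non-zero only for SIGNIFICANT_REGRESSION, so a caller
# (for example a CI job) can fail on large quality drops while tolerating
# minor fluctuations between runs.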


if __name__ == '__main__':
    main()