Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs

Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes

Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples

Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios

Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
332 lines · 11 KiB · Python
#!/usr/bin/env python3
"""
A/B Test Results Comparator for RAGAS Evaluation

Compares two RAGAS evaluation result files to determine if a change
(e.g., orphan connections) improved or degraded retrieval quality.

Usage:
    python lightrag/evaluation/compare_results.py baseline.json experiment.json
    python lightrag/evaluation/compare_results.py results_a.json results_b.json --output comparison.json
"""
import argparse
import json
import math
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

from lightrag.utils import logger


@dataclass
class MetricComparison:
    """Comparison of a single metric between two runs."""

    metric_name: str
    baseline_value: float
    experiment_value: float
    absolute_change: float
    relative_change_percent: float
    improved: bool
    significant: bool  # > 5% change


def safe_float(value: Any, default: float = 0.0) -> float:
    """Safely convert a value to float, handling NaN."""
    if value is None:
        return default
    try:
        f = float(value)
        if math.isnan(f):
            return default
        return f
    except (ValueError, TypeError):
        return default
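# Illustrative behavior (not exhaustive): safe_float("0.85") returns 0.85,
# safe_float(float("nan")) returns 0.0, and safe_float(None) returns 0.0.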


def compare_metrics(baseline: dict, experiment: dict) -> list[MetricComparison]:
    """
    Compare metrics between baseline and experiment.

    Args:
        baseline: Benchmark stats from baseline run
        experiment: Benchmark stats from experiment run

    Returns:
        List of MetricComparison objects
    """
    comparisons = []

    baseline_avg = baseline.get('average_metrics', {})
    experiment_avg = experiment.get('average_metrics', {})

    metrics_to_compare = [
        ('faithfulness', 'Faithfulness'),
        ('answer_relevance', 'Answer Relevance'),
        ('context_recall', 'Context Recall'),
        ('context_precision', 'Context Precision'),
        ('ragas_score', 'RAGAS Score'),
    ]

    for metric_key, metric_name in metrics_to_compare:
        b_val = safe_float(baseline_avg.get(metric_key, 0))
        e_val = safe_float(experiment_avg.get(metric_key, 0))

        abs_change = e_val - b_val
        rel_change = (abs_change / b_val * 100) if b_val > 0 else 0

        comparisons.append(
            MetricComparison(
                metric_name=metric_name,
                baseline_value=b_val,
                experiment_value=e_val,
                absolute_change=abs_change,
                relative_change_percent=rel_change,
                improved=abs_change > 0,
                significant=abs(rel_change) > 5,  # > 5% is significant
            )
        )

    return comparisons
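# Worked example with illustrative numbers: a ragas_score of 0.80 in the
# baseline and 0.86 in the experiment gives absolute_change = 0.06 and
# relative_change_percent = 7.5, so the metric is marked improved and,
# being above the 5% threshold, significant.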


def analyze_results(baseline_path: Path, experiment_path: Path) -> dict:
    """
    Perform comprehensive A/B analysis.

    Args:
        baseline_path: Path to baseline results JSON
        experiment_path: Path to experiment results JSON

    Returns:
        Analysis results dictionary
    """
    # Load results
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(experiment_path) as f:
        experiment = json.load(f)

    baseline_stats = baseline.get('benchmark_stats', {})
    experiment_stats = experiment.get('benchmark_stats', {})

    # Compare metrics
    comparisons = compare_metrics(baseline_stats, experiment_stats)

    # Calculate overall verdict
    improvements = sum(1 for c in comparisons if c.improved)
    regressions = sum(1 for c in comparisons if not c.improved and c.absolute_change != 0)
    significant_improvements = sum(1 for c in comparisons if c.improved and c.significant)
    significant_regressions = sum(1 for c in comparisons if not c.improved and c.significant)

    # Determine verdict
    ragas_comparison = next((c for c in comparisons if c.metric_name == 'RAGAS Score'), None)

    if ragas_comparison:
        if ragas_comparison.improved and ragas_comparison.significant:
            verdict = 'SIGNIFICANT_IMPROVEMENT'
            verdict_description = f'RAGAS Score improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.improved:
            verdict = 'MINOR_IMPROVEMENT'
            verdict_description = f'RAGAS Score slightly improved by {ragas_comparison.relative_change_percent:.1f}%'
        elif ragas_comparison.significant:
            verdict = 'SIGNIFICANT_REGRESSION'
            verdict_description = f'RAGAS Score regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
        elif ragas_comparison.absolute_change == 0:
            verdict = 'NO_CHANGE'
            verdict_description = 'No measurable difference between runs'
        else:
            verdict = 'MINOR_REGRESSION'
            verdict_description = (
                f'RAGAS Score slightly regressed by {abs(ragas_comparison.relative_change_percent):.1f}%'
            )
    else:
        verdict = 'UNKNOWN'
        verdict_description = 'Could not determine RAGAS score comparison'

    return {
        'analysis_timestamp': datetime.now().isoformat(),
        'baseline_file': str(baseline_path),
        'experiment_file': str(experiment_path),
        'verdict': verdict,
        'verdict_description': verdict_description,
        'summary': {
            'metrics_improved': improvements,
            'metrics_regressed': regressions,
            'significant_improvements': significant_improvements,
            'significant_regressions': significant_regressions,
        },
        'metrics': [
            {
                'name': c.metric_name,
                'baseline': round(c.baseline_value, 4),
                'experiment': round(c.experiment_value, 4),
                'change': round(c.absolute_change, 4),
                'change_percent': round(c.relative_change_percent, 2),
                'improved': c.improved,
                'significant': c.significant,
            }
            for c in comparisons
        ],
        'baseline_summary': {
            'total_tests': baseline_stats.get('total_tests', 0),
            'successful_tests': baseline_stats.get('successful_tests', 0),
            'success_rate': baseline_stats.get('success_rate', 0),
        },
        'experiment_summary': {
            'total_tests': experiment_stats.get('total_tests', 0),
            'successful_tests': experiment_stats.get('successful_tests', 0),
            'success_rate': experiment_stats.get('success_rate', 0),
        },
    }


def print_comparison_report(analysis: dict):
    """Print a formatted comparison report to stdout."""
    print('=' * 70)
    print('A/B TEST COMPARISON REPORT')
    print('=' * 70)
    print(f'Baseline: {analysis["baseline_file"]}')
    print(f'Experiment: {analysis["experiment_file"]}')
    print('-' * 70)

    # Verdict
    verdict = analysis['verdict']
    verdict_icon = {
        'SIGNIFICANT_IMPROVEMENT': 'PASS',
        'MINOR_IMPROVEMENT': 'PASS',
        'NO_CHANGE': '~',
        'MINOR_REGRESSION': 'WARN',
        'SIGNIFICANT_REGRESSION': 'FAIL',
        'UNKNOWN': '?',
    }.get(verdict, '?')

    print(f'\n[{verdict_icon}] VERDICT: {verdict}')
    print(f' {analysis["verdict_description"]}')

    # Metrics table
    print('\n' + '-' * 70)
    print(f'{"Metric":<20} {"Baseline":>10} {"Experiment":>10} {"Change":>10} {"Status":>10}')
    print('-' * 70)

    for metric in analysis['metrics']:
        name = metric['name']
        baseline = f'{metric["baseline"]:.4f}'
        experiment = f'{metric["experiment"]:.4f}'

        change = metric['change']
        change_pct = metric['change_percent']
        if change > 0:
            change_str = f'+{change:.4f}'
            status = f'+{change_pct:.1f}%'
        elif change < 0:
            change_str = f'{change:.4f}'
            status = f'{change_pct:.1f}%'
        else:
            change_str = '0.0000'
            status = '0.0%'

        if metric['significant']:
            status = f'[UP] {status}' if metric['improved'] else f'[DOWN] {status}'
        else:
            status = f' {status}'

        print(f'{name:<20} {baseline:>10} {experiment:>10} {change_str:>10} {status:>10}')

    print('-' * 70)

    # Summary
    summary = analysis['summary']
    print(f'\nSummary: {summary["metrics_improved"]} improved, {summary["metrics_regressed"]} regressed')
    print(
        f' {summary["significant_improvements"]} significant improvements, {summary["significant_regressions"]} significant regressions'
    )

    # Test counts
    b_summary = analysis['baseline_summary']
    e_summary = analysis['experiment_summary']
    print(
        f'\nBaseline: {b_summary["successful_tests"]}/{b_summary["total_tests"]} tests ({b_summary["success_rate"]:.1f}% success)'
    )
    print(
        f'Experiment: {e_summary["successful_tests"]}/{e_summary["total_tests"]} tests ({e_summary["success_rate"]:.1f}% success)'
    )

    print('=' * 70)


def main():
    parser = argparse.ArgumentParser(
        description='Compare RAGAS evaluation results from two runs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare baseline vs experiment
  python lightrag/evaluation/compare_results.py baseline.json experiment.json

  # Save comparison to file
  python lightrag/evaluation/compare_results.py baseline.json experiment.json --output comparison.json

  # Compare with/without orphan connections
  python lightrag/evaluation/compare_results.py results_without_orphans.json results_with_orphans.json
""",
    )

    parser.add_argument(
        'baseline',
        type=str,
        help='Path to baseline results JSON file',
    )

    parser.add_argument(
        'experiment',
        type=str,
        help='Path to experiment results JSON file',
    )

    parser.add_argument(
        '--output',
        '-o',
        type=str,
        default=None,
        help='Output path for comparison JSON (optional)',
    )

    args = parser.parse_args()

    baseline_path = Path(args.baseline)
    experiment_path = Path(args.experiment)

    # Validate files exist
    if not baseline_path.exists():
        logger.error('Error: Baseline file not found: %s', baseline_path)
        sys.exit(1)
    if not experiment_path.exists():
        logger.error('Error: Experiment file not found: %s', experiment_path)
        sys.exit(1)

    # Run analysis
    analysis = analyze_results(baseline_path, experiment_path)

    # Print report
    print_comparison_report(analysis)

    # Save to file if requested
    if args.output:
        output_path = Path(args.output)
        with open(output_path, 'w') as f:
            json.dump(analysis, f, indent=2)
        logger.info('Comparison saved to: %s', output_path)

    # Exit with status based on verdict
    if analysis['verdict'] in ('SIGNIFICANT_REGRESSION',):
        sys.exit(1)
    sys.exit(0)


if __name__ == '__main__':
    main()